Here you can enter your name to register as a user in the notebook. Once you have done it, you can add your personal paths in the "Paths" section of the notebook.
# Active notebook user; selects which set of Drive paths is used in the "Paths" section.
user = "samuel" #Values : marius || samuel
In this section we load all the libraries we need for the notebook to be working properly. Here is the extensive list of utilized libraries :
Data Analysis libraries :
Visualization libraries :
Machine and Deep Learning libraries :
Scraping libraries :
Api libraries
Data storage related libraries :
# Mount Google Drive so the datasets stored there are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib.colors as plt_colors
import matplotlib.cm as cm
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from textwrap import wrap
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
from datetime import datetime, timedelta
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from gensim.models import KeyedVectors
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ARDRegression
from sklearn.model_selection import cross_validate,learning_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score,classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from yellowbrick.model_selection import LearningCurve
!pip install auto-sklearn
import autosklearn
import autosklearn.classification
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Requirement already satisfied: auto-sklearn in /usr/local/lib/python3.8/dist-packages (0.15.0) Requirement already satisfied: scikit-learn<0.25.0,>=0.24.0 in /usr/local/lib/python3.8/dist-packages (from auto-sklearn) (0.24.2) Requirement already satisfied: scipy>=1.7.0 in /usr/local/lib/python3.8/dist-packages (from auto-sklearn) (1.7.3) Requirement already satisfied: pyrfr<0.9,>=0.8.1 in /usr/local/lib/python3.8/dist-packages (from auto-sklearn) (0.8.3) Requirement already satisfied: distro in /usr/local/lib/python3.8/dist-packages (from auto-sklearn) (1.8.0) Requirement already satisfied: numpy>=1.9.0 in /usr/local/lib/python3.8/dist-packages (from auto-sklearn) (1.21.6) Requirement already satisfied: smac<1.3,>=1.2 in /usr/local/lib/python3.8/dist-packages (from auto-sklearn) (1.2) Requirement already satisfied: pandas>=1.0 in /usr/local/lib/python3.8/dist-packages (from auto-sklearn) (1.3.5) Requirement already satisfied: liac-arff in /usr/local/lib/python3.8/dist-packages (from auto-sklearn) (2.5.0) Requirement already satisfied: joblib in /usr/local/lib/python3.8/dist-packages (from auto-sklearn) (1.2.0) Requirement already satisfied: distributed>=2012.12 in /usr/local/lib/python3.8/dist-packages (from auto-sklearn) (2022.2.0) Requirement already satisfied: typing-extensions in /usr/local/lib/python3.8/dist-packages (from auto-sklearn) (4.1.1) Requirement already satisfied: ConfigSpace<0.5,>=0.4.21 in /usr/local/lib/python3.8/dist-packages (from auto-sklearn) (0.4.21) Requirement already satisfied: pynisher<0.7,>=0.6.3 in /usr/local/lib/python3.8/dist-packages (from auto-sklearn) (0.6.4) Requirement already satisfied: setuptools in /usr/local/lib/python3.8/dist-packages (from auto-sklearn) (57.4.0) Requirement already satisfied: dask>=2021.12 in /usr/local/lib/python3.8/dist-packages (from auto-sklearn) (2022.2.0) Requirement already satisfied: pyyaml in 
/usr/local/lib/python3.8/dist-packages (from auto-sklearn) (6.0) Requirement already satisfied: threadpoolctl in /usr/local/lib/python3.8/dist-packages (from auto-sklearn) (3.1.0) Requirement already satisfied: cython in /usr/local/lib/python3.8/dist-packages (from ConfigSpace<0.5,>=0.4.21->auto-sklearn) (0.29.32) Requirement already satisfied: pyparsing in /usr/local/lib/python3.8/dist-packages (from ConfigSpace<0.5,>=0.4.21->auto-sklearn) (3.0.9) Requirement already satisfied: partd>=0.3.10 in /usr/local/lib/python3.8/dist-packages (from dask>=2021.12->auto-sklearn) (1.3.0) Requirement already satisfied: cloudpickle>=1.1.1 in /usr/local/lib/python3.8/dist-packages (from dask>=2021.12->auto-sklearn) (1.5.0) Requirement already satisfied: toolz>=0.8.2 in /usr/local/lib/python3.8/dist-packages (from dask>=2021.12->auto-sklearn) (0.12.0) Requirement already satisfied: fsspec>=0.6.0 in /usr/local/lib/python3.8/dist-packages (from dask>=2021.12->auto-sklearn) (2022.11.0) Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.8/dist-packages (from dask>=2021.12->auto-sklearn) (21.3) Requirement already satisfied: psutil>=5.0 in /usr/local/lib/python3.8/dist-packages (from distributed>=2012.12->auto-sklearn) (5.4.8) Requirement already satisfied: tornado>=6.0.3 in /usr/local/lib/python3.8/dist-packages (from distributed>=2012.12->auto-sklearn) (6.0.4) Requirement already satisfied: msgpack>=0.6.0 in /usr/local/lib/python3.8/dist-packages (from distributed>=2012.12->auto-sklearn) (1.0.4) Requirement already satisfied: tblib>=1.6.0 in /usr/local/lib/python3.8/dist-packages (from distributed>=2012.12->auto-sklearn) (1.7.0) Requirement already satisfied: click>=6.6 in /usr/local/lib/python3.8/dist-packages (from distributed>=2012.12->auto-sklearn) (7.1.2) Requirement already satisfied: jinja2 in /usr/local/lib/python3.8/dist-packages (from distributed>=2012.12->auto-sklearn) (2.11.3) Requirement already satisfied: zict>=0.1.3 in 
/usr/local/lib/python3.8/dist-packages (from distributed>=2012.12->auto-sklearn) (2.2.0) Requirement already satisfied: sortedcontainers!=2.0.0,!=2.0.1 in /usr/local/lib/python3.8/dist-packages (from distributed>=2012.12->auto-sklearn) (2.4.0) Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas>=1.0->auto-sklearn) (2022.6) Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas>=1.0->auto-sklearn) (2.8.2) Requirement already satisfied: locket in /usr/local/lib/python3.8/dist-packages (from partd>=0.3.10->dask>=2021.12->auto-sklearn) (1.0.0) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.8/dist-packages (from python-dateutil>=2.7.3->pandas>=1.0->auto-sklearn) (1.15.0) Requirement already satisfied: emcee>=3.0.0 in /usr/local/lib/python3.8/dist-packages (from smac<1.3,>=1.2->auto-sklearn) (3.1.3) Requirement already satisfied: heapdict in /usr/local/lib/python3.8/dist-packages (from zict>=0.1.3->distributed>=2012.12->auto-sklearn) (1.0.1) Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.8/dist-packages (from jinja2->distributed>=2012.12->auto-sklearn) (2.0.1)
import plotly.express as px
from collections import OrderedDict
!pip install selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, BasicTicker, ColorBar, LinearColorMapper, PrintfTickFormatter
from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap, transform
output_notebook()
from bokeh.palettes import Spectral5
from bokeh.palettes import PRGn7
from bokeh.palettes import Magma7
from bokeh.palettes import Viridis
from bokeh.palettes import Pastel2_7
from bokeh.palettes import Purples9
from bokeh.palettes import Greens9
from bokeh.palettes import YlOrRd9
from bokeh.palettes import Magma4
from bokeh.palettes import Magma6
from bokeh.palettes import Magma7
from bokeh.palettes import Magma10
from bokeh.palettes import Magma
from wordcloud import WordCloud
from PIL import Image
from missingno import matrix
!pip install bar_chart_race
import bar_chart_race as bcr
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Collecting bar_chart_race Downloading bar_chart_race-0.1.0-py3-none-any.whl (156 kB) |████████████████████████████████| 156 kB 15.3 MB/s Requirement already satisfied: matplotlib>=3.1 in /usr/local/lib/python3.8/dist-packages (from bar_chart_race) (3.2.2) Requirement already satisfied: pandas>=0.24 in /usr/local/lib/python3.8/dist-packages (from bar_chart_race) (1.3.5) Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib>=3.1->bar_chart_race) (2.8.2) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib>=3.1->bar_chart_race) (3.0.9) Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.8/dist-packages (from matplotlib>=3.1->bar_chart_race) (0.11.0) Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.8/dist-packages (from matplotlib>=3.1->bar_chart_race) (1.21.6) Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib>=3.1->bar_chart_race) (1.4.4) Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas>=0.24->bar_chart_race) (2022.6) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.8/dist-packages (from python-dateutil>=2.1->matplotlib>=3.1->bar_chart_race) (1.15.0) Installing collected packages: bar-chart-race Successfully installed bar-chart-race-0.1.0
import tensorflow as tf
from keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Dropout
from keras.utils import to_categorical
import warnings
import pickle
import flask
from flask import Flask, request
This section allows you to save the paths of your datasets and images.
# Per-user absolute Drive paths to the datasets and image assets.
# Marius's Drive layout:
vecto_path_marius = r"/content/drive/MyDrive/SWITCH/Python for Data Analysis/Projet Final/DataFrames/GoogleNews-vectors-negative300.bin"
online_news_popularity_path_marius = r"/content/drive/MyDrive/SWITCH/Python for Data Analysis/Projet Final/DataFrames/OnlineNewsPopularityWithAutorsAndTitles.csv"
multi_timeline_path_marius = r"/content/drive/MyDrive/SWITCH/Python for Data Analysis/Projet Final/DataFrames/multiTimeline.csv"
mlogo_path_marius = r"/content/drive/MyDrive/SWITCH/Python for Data Analysis/Projet Final/DataFrames/Mlogo.png"
# Samuel's Drive layout:
vecto_path_samuel = r"/content/drive/MyDrive/pyhton/GoogleNews-vectors-negative300.bin"
online_news_popularity_path_samuel = r"/content/drive/MyDrive/pyhton/OnlineNewsPopularityWithAutorsAndTitles.csv"
multi_timeline_path_samuel = r"/content/drive/MyDrive/pyhton/multiTimeline.csv"
mlogo_path_samuel = r"/content/drive/MyDrive/pyhton/Mlogo.png"
In the cell below, you can add your username to the if/else statement so your paths are linked to your username:
# Resolve the per-user paths into the generic names the rest of the notebook uses.
if user == "samuel":
    vecto_path = vecto_path_samuel
    online_news_popularity_path = online_news_popularity_path_samuel
    multi_timeline_path = multi_timeline_path_samuel
    mlogo_path = mlogo_path_samuel
elif user == "marius":
    vecto_path = vecto_path_marius
    online_news_popularity_path = online_news_popularity_path_marius
    multi_timeline_path = multi_timeline_path_marius
    mlogo_path = mlogo_path_marius
In the context of our project we studied the online-news-popularity dataset of mashable articles. Our project contains 2 csv datasets loaded in this section :
Online News Popularity Dataset
Timeline Dataset
# Load the Online News Popularity dataset (one row per Mashable article) and peek at it.
news = pd.read_csv(online_news_popularity_path)
news.head(5)
news.shape
(39644, 63)
# Load the weekly timeline dataset (columns are in French: "Semaine" = week;
# presumably a Google-Trends-style interest score for Mashable — see its use below).
timeline = pd.read_csv(multi_timeline_path)
timeline.head(5)
timeline.shape
(103, 2)
In the preprocessing section, our objective is to make the data usable for visualization and modeling. To do so, we have to perform multiple modifications on the dataset. Find the complete explanation below.
# The raw CSV column names carry a leading space (e.g. " timedelta"); strip it
# so columns can be addressed by their clean names. Passing `str.lstrip` as the
# mapper replaces the original temporary-list + dict dance.
news.rename(columns=str.lstrip, inplace=True)
# Optional scraping step: fetch author name and title for every article URL with
# Selenium. The results were persisted once to
# OnlineNewsPopularityWithAutorsAndTitles.csv, so this is disabled by default.
scrap = False
if scrap:
    import time  # stdlib sleep; imported locally because a later cell rebinds the name `time`

    urls = news["url"]
    driver = webdriver.Chrome('chromedriver.exe')

    # Open the first page once to dismiss the cookie-consent banner.
    driver.get(urls[0])
    time.sleep(2)
    element = driver.find_elements(by=By.CSS_SELECTOR, value='div[id="onetrust-button-group"]')
    element[0].click()

    authors = []
    titles = []
    # BUGFIX: the loop bound was `len(urls[i])` — the length of one URL string,
    # with `i` not even defined yet. Iterate over every URL instead.
    for i in range(len(urls)):
        driver.get(urls[i])
        time.sleep(0.5)
        try:
            # Wait until both the author link and the title header are rendered.
            timer1 = WebDriverWait(driver, 2).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'a[class="underline-link"]')))
            timer2 = WebDriverWait(driver, 2).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h1[class="mt-4 header-100 max-w-5xl "]')))
            name = driver.find_elements(by=By.CSS_SELECTOR, value='a[class="underline-link"]')
            title = driver.find_elements(by=By.CSS_SELECTOR, value='h1[class="mt-4 header-100 max-w-5xl "]')
            authors.append(name[0].text)
            titles.append(title[0].text)
        except (TimeoutException, NameError):
            # Page gone or different layout: record a placeholder (filtered later).
            authors.append('Nan')
            titles.append('Nan')
    news['Autors'] = authors
    news['titles'] = titles
    # BUGFIX: `df` was undefined here; persist the enriched `news` frame instead.
    news.to_csv('OnlineNewsPopularityWithAutorsAndTitles.csv')
# Map each article's `timedelta` value (8..731, days between publication and
# dataset acquisition) back to a calendar date, anchored at 27 Dec 2014:
# timedelta 8 -> 2014-12-27, timedelta 9 -> 2014-12-26, and so on backwards.
anchor_date = datetime.strptime("27/12/2014", '%d/%m/%Y')
delta_values = list(range(8, 732))
calendar_dates = [anchor_date - timedelta(offset) for offset in range(len(delta_values))]
dico = dict(zip(delta_values, calendar_dates))
news['date'] = news['timedelta'].map(dico)
# Build a per-day visit estimate ("NbVisit") from the weekly timeline, then
# attach it to each article through its release date.
days = timeline['Semaine']
count = timeline['Mashable: (Dans tous les pays)']
# Parse the weekly timestamps.
days1 = []
for day in days:
    days1.append(datetime.strptime(day,'%Y-%m-%d'))
days2 = []
count1 = []
# Pad the first days of January 2013 (before the first full tracked week)
# with a constant value.
days2.append(datetime.strptime('2013-01-03','%Y-%m-%d'))
days2.append(datetime.strptime('2013-01-04','%Y-%m-%d'))
days2.append(datetime.strptime('2013-01-05','%Y-%m-%d'))
count1.append(8.28)
count1.append(8.28)
count1.append(8.28)
# Expand every weekly record to its 7 days, splitting the weekly count evenly.
for day in days1:
    for i in range(7):
        days2.append(day + timedelta(i))
for coun in count:
    for i in range(7):
        count1.append(coun/7)
# Pad the last days of December 2014 (after the last full tracked week)
# with a constant value.
count1.append(4.7)
count1.append(4.7)
count1.append(4.7)
count1.append(4.7)
count1.append(4.7)
count1.append(4.7)
days2.append(datetime.strptime('2014-12-22','%Y-%m-%d'))
days2.append(datetime.strptime('2014-12-23','%Y-%m-%d'))
days2.append(datetime.strptime('2014-12-24','%Y-%m-%d'))
days2.append(datetime.strptime('2014-12-25','%Y-%m-%d'))
days2.append(datetime.strptime('2014-12-26','%Y-%m-%d'))
days2.append(datetime.strptime('2014-12-27','%Y-%m-%d'))
# Map "YYYY-MM-DD" -> daily visit estimate, then look it up per article date.
dico = {str(days2[i])[:10]:count1[i] for i in range(len(days2))}
news['NbVisit'] = news['date'].astype(str).map(dico)
# Two discretizations of the share count:
#  - Class_shares1: binary split at the 1400-share threshold;
#  - Class_shares2: four balanced quartile classes.
news['Class_shares1'] = np.where(news['shares'] < 1400, 'low', 'high')
news['Class_shares2'] = pd.qcut(news.shares, 4,
                                labels=['very low', 'low', 'high', 'very high'])
# Visualize missingness, then keep only articles with actual text content and
# with valid [0, 1] rates in columns 4..6.
_ = matrix(news)
news = news[news["n_tokens_content"] > 0]
for rate_col in range(4, 7):
    in_unit_interval = (news.iloc[:, rate_col] >= 0) & (news.iloc[:, rate_col] <= 1)
    news = news[in_unit_interval]
# Collapse the six one-hot "data_channel_is_*" columns (positions 13..18) into a
# single categorical "Chanel" column. np.select picks the FIRST matching
# condition, which mirrors the original row-wise if/elif chain; rows with no
# channel flag fall through to "Others". This is vectorized, replacing a Python
# loop over every row with per-cell .iloc lookups.
channel_labels = ["LifeStyle", "Entertainment", "Business",
                  "Social Media", "Tech", "World"]
channel_flags = [news.iloc[:, col] == 1 for col in range(13, 19)]
news["Chanel"] = np.select(channel_flags, channel_labels, default="Others")
# Columns 19..27 (keyword share statistics) use negative values to mean
# "unknown"; floor them at 0. `clip` is the vectorized equivalent of the
# original per-column list comprehension.
news.iloc[:, 19:28] = news.iloc[:, 19:28].clip(lower=0)
# Collapse the seven one-hot "weekday_is_*" columns (positions 31..37) into a
# single "Weekday" column. np.select keeps the first-match semantics of the
# original if/elif chain, with "Unknown" when no flag is set, and avoids the
# per-row Python loop.
weekday_labels = ["Monday", "Tuesday", "Wednesday", "Thursday",
                  "Friday", "Saturday", "Sunday"]
weekday_flags = [news.iloc[:, col] == 1 for col in range(31, 38)]
news["Weekday"] = np.select(weekday_flags, weekday_labels, default="Unknown")
# Sanity check: count articles with a negative LDA topic weight (expected: 0).
# The original repeated the same `< 0` test on both sides of each `|`
# (a copy-paste slip); a single test per column is equivalent.
for lda_col in ("LDA_00", "LDA_01", "LDA_02", "LDA_03"):
    print(f'{lda_col} : {news[news[lda_col] < 0].url.count()}')
LDA_00 : 0 LDA_01 : 0 LDA_02 : 0 LDA_03 : 0
# Display any rows with a NaN LDA_03 value (expected: none).
news[(news["LDA_03"].isna() == True)]
# Compare the raw (heavily right-skewed) share distribution with its log transform.
f, axs = plt.subplots(1, 2)
_=axs[0].hist(x=news["shares"], color = '#4C4EDD')
_=axs[1].hist(x=np.log(news["shares"]), color = '#E84855')
plt.tight_layout()
#news["shares"] = np.log(news["shares"])
# Count the articles whose scraping failed (placeholder string "Nan" stored by
# the scraping cell above); both columns fail together for the same pages.
print(f'Nan in Authors: {news[news["Authors"] == "Nan"].url.count()}')
print(f'Nan in Titles: {news[news["Titles"] == "Nan"].url.count()}')
Nan in Authors: 619 Nan in Titles: 619
# Drop articles with no scraped author, then remove share-count outliers with
# the classic 1.5×IQR fence.
news = news[news["Authors"] != "Nan"]
q1, q3 = news['shares'].quantile(0.25), news['shares'].quantile(0.75)
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
outlier_index = news[(news['shares'] < lower_fence) | (news['shares'] > upper_fence)].index
news.drop(index=outlier_index, inplace=True)
/usr/local/lib/python3.8/dist-packages/pandas/core/frame.py:4906: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return super().drop(
# Load the pretrained GoogleNews word2vec model (large binary on Drive) and
# build a 300-dim embedding for each channel label.
load = True
if load:
    google_vecto_model = KeyedVectors.load_word2vec_format(vecto_path, binary=True)
news.reset_index(drop = True, inplace = True)
chanels = news.Chanel.unique()
# "Social Media" contains a space, which is not a single word2vec key; use the
# fused token for the lookup, then restore the original label below.
chanels = [x if x != "Social Media" else "SocialMedia" for x in chanels]
# "Others" is not a meaningful word2vec token: map it to the zero vector.
vec_chanels = {(chanel):(google_vecto_model[chanel] if chanel != "Others" else [0 for i in range(300)]) for chanel in chanels}
vec_chanels["Social Media"] = vec_chanels.pop("SocialMedia")
# Attach the 300 embedding components of each article's channel as columns
# "WChanel 0".."WChanel 299". Stacking one vector per article and letting
# numpy build the matrix replaces the original triple Python loop
# (300 list-appends per article).
vec_len = len(vec_chanels["Social Media"])
embedding_matrix = np.array([vec_chanels[chanel] for chanel in news["Chanel"]])
name_cols_chanels_vec = [f"WChanel {i}" for i in range(vec_len)]
df_chanel_vec = pd.DataFrame(embedding_matrix, columns=name_cols_chanels_vec)
news = pd.concat((news, df_chanel_vec), axis = 1)
export = False
# v_news is the visualization frame: drop the 300 embedding columns, the
# one-hot flags already summarized by "Chanel"/"Weekday", and identifiers.
# One drop call replaces the original sixteen consecutive in-place drops.
v_news = news.copy()
v_news_drop_cols = [f"WChanel {i}" for i in range(300)] + [
    'data_channel_is_lifestyle',
    'data_channel_is_entertainment',
    'data_channel_is_bus',
    'data_channel_is_socmed',
    'data_channel_is_tech',
    'data_channel_is_world',
    'weekday_is_monday',
    'weekday_is_tuesday',
    'weekday_is_wednesday',
    'weekday_is_thursday',
    'weekday_is_friday',
    'weekday_is_saturday',
    'weekday_is_sunday',
    'url',
    'timedelta',
]
v_news.drop(columns=v_news_drop_cols, inplace=True)
# Working copy of the cleaned dataset for the modeling section.
m_news = news.copy()
Firstly, we remove columns that won't be used for prediction.
# Drop columns that must not leak into the models: the categorical summaries
# (already embedded / one-hot encoded elsewhere), free-text identifiers, and
# the redundant is_weekend flag. One drop call replaces eight in-place drops.
m_news_drop_cols = ['Chanel', 'Weekday', 'url', 'date',
                    'Authors', 'Titles', 'timedelta', 'is_weekend']
m_news.drop(columns=m_news_drop_cols, inplace=True)
# NOTE(review): `x_train_selector` is defined in a feature-selection cell that
# is missing from this export — confirm the cell exists before re-running.
x_train_selector.isna().sum().sum()
0
# Keep only the selected feature columns plus the excluded (target/meta) ones.
# NOTE(review): `selected_name_columns` and `exclude` come from a cell missing
# in this export — verify they are defined before re-running.
m_news.reset_index(drop=True, inplace=True)
m_news = pd.concat((m_news[selected_name_columns], m_news[exclude]), axis = 1)
# Compress the 300 channel-embedding columns down to `nb_dim` principal
# components, then swap the reduced columns into m_news in place of the raw
# embedding.
nb_dim = 5
vec_dim = 300
embedding_block = m_news.loc[:, "WChanel 0":"WChanel 299"]
pca = PCA(n_components=nb_dim)
pca.fit(embedding_block)
explanation_coefs = pca.explained_variance_ratio_
print(f"Explained information: {round(np.sum(explanation_coefs), 2)}")
df_reduced_vec = pd.DataFrame(pca.transform(embedding_block),
                              columns=[f"WChanel {i}" for i in range(nb_dim)])
m_news.drop(columns=[f"WChanel {i}" for i in range(vec_dim)], inplace=True)
m_news = pd.concat((m_news, df_reduced_vec), axis = 1)
m_news.head(2)
if export:
    m_news.to_csv("m_news.csv")
This section is dedicated to understanding how our data behaves through multiple graphs. Our work in this part can be split in two categories :
Univariate Visualization :
Multivariate Visualization :
# Bokeh bar chart: number of released articles per weekday, in calendar order.
week_ref = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
Weekday_counts = v_news.groupby(by="Weekday").Weekday.count().reindex(week_ref)
index = list(Weekday_counts.index)
values = list(Weekday_counts.values)
source = ColumnDataSource(data=OrderedDict(weekday=index, counts=values))
p = figure(x_range=index, y_range=(0, np.max(values)+1000), height=350, title="Release Day Frequencies",
           toolbar_location=None, tools="hover", tooltips="@weekday: @counts")
p.vbar(x='weekday', top='counts', width=0.9, source=source,
       line_color='white', fill_color=factor_cmap('weekday', palette=Magma7, factors=index))
p.xgrid.grid_line_color = None
_=show(p, notebook_handle=True)
# Radar (polar) chart of the number of articles per channel.
chanel = v_news.groupby(by="Chanel").Chanel.count()
fig = go.Figure(data=go.Scatterpolar(
    r=chanel.values,
    theta=chanel.index,
    fill='toself',
    name='Frequencies of Chanels',
    marker_color=Magma7[2],
    line_color = Magma7[2],
    marker_line_color="black",
))
fig.update_layout(
    title={
        'text': "Frequencies of Chanels",
        'y': 0.9,
        'x': 0.5
    },
    polar=dict(
        radialaxis=dict(
            visible=True,
        ),
    ),
    showlegend=False,
)
fig.show()
# Pie chart of the 10 authors with the most published articles; the largest
# slice is pulled out of the pie via `explode`.
autnb = v_news.groupby('Authors').count()['Titles'].sort_values(ascending = False)[:10]
autnb =autnb.reset_index()
pie_values=autnb["Titles"]
pie_labels=autnb["Authors"]
explode = [0.1 if i == 0 else 0 for i in range(10)]
fig1, ax1 = plt.subplots()
ax1.pie(pie_values, labels=pie_labels,
        shadow=True, startangle=0, colors=Magma10, explode=explode)
plt.title("10 Most prolific Authors on Mashable")
# Smoothed line chart of the number of article releases per month.
dates_by_months = v_news.groupby(v_news['date'].dt.strftime('%Y-%m')).date.count()
fig = go.Figure()
fig.add_trace(go.Scatter(x=dates_by_months.index, y=dates_by_months.values, name="spline",
                         text=["tweak line smoothness<br>with 'smoothing' in line object"],
                         hoverinfo='x+y',
                         line_shape='spline',
                         line_color = Magma7[2]))
fig.update_layout(
    title={
        'text': "Frequencies of Releases",
        'y': 0.9,
        'x': 0.5},
    showlegend=False,
)
# Build a word cloud from all article titles, shaped by the Mashable logo mask.
# Joining the titles directly with a space is exactly equivalent to the original
# split-every-title / append / re-join loop (`' '.join(s.split(' ')) == s`).
text = ' '.join(v_news['Titles'])
mask = np.array(Image.open(mlogo_path))
# Pure-black mask pixels would be treated as drawable area; turn them white.
mask[mask == 0] = 255
wordcloud = WordCloud(background_color = 'white', max_words = 200, mask =mask,contour_width=1).generate(text)
# Color function for WordCloud.recolor: the first 11 recolor calls return red,
# every later call returns black.
i = 0
def couleur(*args, **kwargs):
    import random
    global i
    if i < 11:
        i = i+1
        return "rgb(255, 0, 0)"
    if i > 10:
        return "rgb(0, 0, 0)"
plt.subplots(figsize=(60, 20))
plt.imshow(wordcloud.recolor(color_func = couleur))
# NOTE(review): this second imshow draws the non-recolored cloud onto the same
# axes, so the red/black recoloring above is overwritten in the final figure —
# confirm whether that is intentional.
plt.imshow(wordcloud)
plt.axis("off")
plt.show();
# Share distributions: raw continuous values, the binary classes, and the
# quartile classes built during preprocessing.
px.histogram(v_news, x="shares", color_discrete_sequence = [Magma7[2]],
             title = "Continuous shares distribution")
px.histogram(v_news, x="Class_shares1", color_discrete_sequence = [Magma7[2]],
             title = "Binary discretized shares distribution")
px.histogram(v_news, x="Class_shares2", color_discrete_sequence = [Magma7[2]],
             title = "Quantile discretized shares distribution")
# Histogram grid over every numeric column of the visualization frame.
v_news.hist(figsize=(20,20), color=Magma7[2])
plt.show()
plt.tight_layout()
# Project the five LDA topic weights onto their first three principal
# components for 3-D plotting; also report the variance they explain.
lda = v_news.loc[:, "LDA_00":"LDA_04"]
pca = PCA(n_components=3).fit(lda)
explanation_coefs = pca.explained_variance_ratio_
pca_values = pca.transform(lda)
np.sum(explanation_coefs)
0.8369528735695806
# 3-D scatter of the first 200 articles in the reduced LDA space, colored by channel.
lda["Chanel"] = v_news["Chanel"]
# NOTE(review): `map_colors`/`colors` are computed but the figure below uses
# `color_discrete_sequence=Magma7` instead — confirm which palette was intended.
map_colors = {
    "Entertainment":"limegreen",
    "Business": "darkturquoise",
    "Tech": "orchid",
    "LifeStyle": "royalblue",
    "World": "gold",
    "Others": "tomato",
    "Social Media": "pink"
}
colors = v_news["Chanel"].map(map_colors)
fig = px.scatter_3d(x=pca_values[:200, 0], y=pca_values[:200, 1], z=pca_values[:200, 2],
                    color = lda["Chanel"].head(200), opacity=0.7, color_discrete_sequence=Magma7)
fig.update_layout(
    margin=dict(l=0, r=0, b=0, t=0),
    legend_title="Chanels",
    title={
        'text': "3D plot of articles in PCA reduced LDA topics space",
        'y': 0.9,
        'x': 0.45
    }
)
warnings.filterwarnings('ignore')
# Ten authors with the highest total share count over the whole period.
top_10_authors = v_news.groupby(by="Authors").shares.sum().sort_values(ascending=False).head(10).index.values
top_10_authors
# Monthly cumulative shares per top author, shaped for bar_chart_race
# (after the transpose: one row per month, one column per author).
top_10_authors_race = v_news[v_news["Authors"].isin(top_10_authors)].reset_index(drop=True)[["Authors", "shares", "date"]]
temp_date = pd.to_datetime(top_10_authors_race["date"]).dt.strftime('%m/%Y')
top_10_authors_race["date"] = pd.to_datetime(temp_date)
table_top_10_authors_race = pd.pivot_table(top_10_authors_race,
                                           index="Authors",
                                           columns="date",
                                           values="shares",
                                           aggfunc="sum").fillna(0).cumsum(axis=1)
table_top_10_authors_race = table_top_10_authors_race.T
# Animated "bar chart race" of cumulative shares per author over time.
bcr.bar_chart_race(
    df=table_top_10_authors_race,
    #filename='covid19_horiz.mp4',
    orientation='h',
    sort='desc',
    n_bars=10,
    fixed_order=False,
    fixed_max=True,
    steps_per_period=2,
    interpolate_period=False,
    label_bars=True,
    bar_size=.95,
    period_label={'x': .99, 'y': .25, 'ha': 'right', 'va': 'center'},
    period_fmt='%Y/%m',
    # Per-frame caption: total of the 6 largest bars (label text is in French).
    period_summary_func=lambda v, r: {'x': .99, 'y': .18,
                                      's': f'Nombre total de partages: {v.nlargest(6).sum():,.0f}',
                                      'ha': 'right', 'size': 8, 'family': 'Courier New'},
    perpendicular_bar_func='median',
    period_length=150,
    figsize=(5, 3),
    dpi=144,
    cmap=Magma10,
    title='TOP 10 most influencial authors of Mashable over time',
    title_size='',
    bar_label_size=7,
    tick_label_size=7,
    writer=None,
    fig=None,
    bar_kwargs={'alpha': .9},
    filter_column_colors=False)
# Sunburst: 10 most prolific authors and 3 articles for each.
# NOTE(review): `df` is a reference to (not a copy of) v_news, so the `count`
# column added below also lands in v_news — confirm that side effect is wanted.
df = v_news
final = pd.DataFrame()
# Map author -> number of published titles, then rank authors by it.
df2 = df.groupby('Authors').count()['Titles'].to_dict()
df['count'] = df['Authors'].map(df2)
df = df.sort_values('count',ascending=False)
aut = df['Authors'].unique()[:10]
# Keep the first 3 rows (articles) of each top author.
for x in aut:
    temp = df[df['Authors']== x]
    final = pd.concat([final,temp.iloc[:3]])
final = final[['Authors','Titles','count']]
fig = px.sunburst(final, path=['Authors', 'Titles'], values='count', color_discrete_sequence=Magma10,
                  title = "Sunburst of the most famous publishers and their 3 most shared articles")
fig.show()
# Pairwise correlation heatmap over all numeric columns of v_news.
corr = v_news.corr()
fig = px.imshow(corr,color_continuous_scale='RdBu_r', text_auto=True, title="Correlation heatmap of all variables")
fig.show()
# Discretize subjectivity and polarity into 0.1-wide bins, then pivot total
# shares over (subjectivity x positive polarity) and
# (subjectivity x negative polarity) and render both as heatmaps.
pol_subj_wr_shares = v_news[["global_subjectivity", "avg_positive_polarity", "avg_negative_polarity", "shares"]]
discrete_pol_subj_wr_shares = pd.DataFrame()
bins_v = [round(x, 1) for x in np.arange(0, 1.1, 0.1)]       # bin edges on [0, 1]
labels_v = [round(i, 1) for i in np.arange(0, 1, 0.1)]
bins_v_neg = [round(x, 1) for x in np.arange(-1, 0.1, 0.1)]  # bin edges on [-1, 0]
labels_v_neg = [round(i, 1) for i in np.arange(-1, 0, 0.1)]
discrete_pol_subj_wr_shares["global_subjectivity"] = pd.cut(pol_subj_wr_shares["global_subjectivity"],
                                                            bins = bins_v,
                                                            labels = labels_v)
discrete_pol_subj_wr_shares["avg_positive_polarity"] = pd.cut(pol_subj_wr_shares["avg_positive_polarity"],
                                                              bins = bins_v,
                                                              labels = labels_v)
# BUGFIX: the original first cut avg_negative_polarity with the POSITIVE bins
# and immediately overwrote the column; only the negative-bin cut is kept.
discrete_pol_subj_wr_shares["avg_negative_polarity"] = pd.cut(pol_subj_wr_shares["avg_negative_polarity"],
                                                              bins = bins_v_neg,
                                                              labels = labels_v_neg)
discrete_pol_subj_wr_shares["shares"] = v_news["shares"]
discrete_pol_subj_wr_shares.reset_index(drop=True, inplace = True)
discrete_table_pol_positive_subj_wr_shares = pd.pivot_table(discrete_pol_subj_wr_shares,
                                                            index="global_subjectivity",
                                                            columns="avg_positive_polarity",
                                                            values="shares",
                                                            aggfunc="sum").fillna(0)
discrete_table_pol_negative_subj_wr_shares = pd.pivot_table(discrete_pol_subj_wr_shares,
                                                            index="global_subjectivity",
                                                            columns="avg_negative_polarity",
                                                            values="shares",
                                                            aggfunc="sum").fillna(0)
f, axs = plt.subplots(1, 2, figsize = (20, 8))
sns.heatmap(discrete_table_pol_positive_subj_wr_shares, ax = axs[0], cmap="Greens")
sns.heatmap(discrete_table_pol_negative_subj_wr_shares, ax = axs[1], cmap="Reds")
axs[0].set_title("Average Positivity", size = 16)
axs[1].set_title("Average Negativity", size=16)
plt.suptitle("Subjectivity, polarity and their effects on shares growth", size = 20)
# Share totals across the global sentiment polarity range, split by the binary
# and by the quartile share classes.
px.histogram(v_news, x="global_sentiment_polarity", y="shares", color="Class_shares1", color_discrete_sequence=Magma4, title="Global sentiment polarity with respect to the number of shares")
px.histogram(v_news, x="global_sentiment_polarity", y="shares", color="Class_shares2", color_discrete_sequence=Magma4, title = "Global sentiment polarity with respect to the number of shares")
# Stacked bars: total shares of the 10 best-scoring authors, broken down by the
# channel their articles were published in.
n_shares_wr_author_chanel = pd.pivot_table(v_news,
                                           index="Authors",
                                           columns="Chanel",
                                           values="shares",
                                           aggfunc="sum").fillna(0)
# Order authors by their total shares across all channels, highest first.
row_totals = n_shares_wr_author_chanel.sum(axis=1)
n_shares_wr_author_chanel = n_shares_wr_author_chanel.loc[row_totals.sort_values(ascending=False).index]
top10_n_shares_wr_author_chanel = n_shares_wr_author_chanel.head(10)
fig = px.bar(top10_n_shares_wr_author_chanel, height=400,
             color_discrete_sequence=Magma7,
             title="Number of shares per author and types of articles written")
fig.show()
The following graph shows the amount of shares with respect to the day of the week. It appears that posting your articles between Tuesday and Thursday is the most optimal way to get at the top of the charts 📈
Source for the graph : https://www.python-graph-gallery.com/web-circular-barplot-with-matplotlib
# Circular bar plot: total shares per release weekday (bars) with the per-day
# mean shown as scaled dots. Adapted from python-graph-gallery's circular barplot.
week_ref = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
dor = v_news[["Weekday", "shares"]].groupby(by="Weekday").sum().reindex(week_ref)
# BUGFIX: the means were left in groupby's alphabetical order (Friday, Monday,
# Saturday, ...) while the bars are reindexed to calendar order, so each dot
# landed on the wrong day; reindex the means the same way as the bars.
means_5000 = (v_news[["Weekday", "shares"]].groupby(by="Weekday").mean().reindex(week_ref).values * 2500)
angles = np.linspace(0.05, 2 * np.pi - 0.05, dor.shape[0], endpoint=False)
GREY12 = "#808080"
plt.rcParams.update({"font.family": "Bell MT"})
plt.rc("axes", unicode_minus=False)
f, ax = plt.subplots(figsize=(7, 10), subplot_kw={"projection": "polar"})
f.patch.set_facecolor("white")
ax.set_facecolor("white")
ax.set_theta_offset(1.2 * np.pi / 2)
# The negative floor leaves an empty disc in the middle of the plot.
ax.set_ylim(-5000000, 12000000)
ax.bar(angles,
       dor["shares"],
       width=0.80,
       color = Magma7,
       alpha = 0.9,
       zorder=10)
ax.vlines(angles,
          0,
          12000000,
          color=GREY12,
          ls=(0, (4, 4)),
          zorder=11)
ax.scatter(angles,
           means_5000,
           s=60,
           color=GREY12,
           zorder=11)
ax.xaxis.grid(False)
ax.spines["start"].set_color("none")
ax.spines["polar"].set_color("none")
ax.set_xticks(angles)
ax.set_yticklabels([])
ax.set_yticks([0, 4000000, 8000000, 12000000])
XTICKS = ax.xaxis.get_major_ticks()
for tick in XTICKS:
    tick.set_pad(10)
PAD = 10
# NOTE(review): the radial ticks sit at 4M/8M/12M but are labeled 5M/10M/15M —
# confirm which scale was intended before publishing this figure.
ax.text(-0.3*np.pi / 2, 4000000 + PAD, "5M", ha="center", size=12)
ax.text(-0.3*np.pi / 2, 8000000 + PAD, "10M", ha="center", size=12)
ax.text(-0.3*np.pi / 2, 12000000 + PAD, "15M", ha="center", size=12)
ax.set_xticklabels(week_ref, size=12);
_=plt.title("BarPlot of shares with respect to the day of the week")
The 3 following heatmaps tend to indicate the optimal number of media items to include in your article.
From this study, the optimal parameters seem to be :
To sum up, be nice and quote your sources ✅
❗This result may be biased because there are less articles with videos and images on mashable.com
# Joint distributions of links / images / videos vs. total shares.
img_vid_wr_shares = v_news[["num_hrefs", "num_imgs", "num_videos", "shares"]]
# Pivot of total shares per (links, images) pair.
# NOTE(review): table1_img_vid_wr_shares is built but never used below — confirm intent.
table1_img_vid_wr_shares = pd.pivot_table(img_vid_wr_shares,
index="num_hrefs",
columns="num_imgs",
values="shares",
aggfunc="sum").fillna(0)
# 2-D histogram: links vs. images, cell colour = summed shares.
fig1 = px.density_heatmap(img_vid_wr_shares,
x="num_hrefs",
y="num_imgs",
z="shares",
marginal_x="histogram",
marginal_y="histogram",
histfunc="sum",
color_continuous_scale=Purples9)
# Links vs. videos.
fig2 = px.density_heatmap(img_vid_wr_shares,
x="num_hrefs",
y="num_videos",
z="shares",
marginal_x="histogram",
marginal_y="histogram",
histfunc="sum",
color_continuous_scale=Greens9)
# Images vs. videos.
fig3 = px.density_heatmap(img_vid_wr_shares,
x="num_imgs",
y="num_videos",
z="shares",
marginal_x="histogram",
marginal_y="histogram",
histfunc="sum",
color_continuous_scale=YlOrRd9)
# Zoom all three heatmaps to the dense region (x <= 30, y <= 10).
fig1.update_layout(xaxis_range=[0,30])
fig1.update_layout(yaxis_range=[0,10])
fig2.update_layout(xaxis_range=[0,30])
fig2.update_layout(yaxis_range=[0,10])
fig3.update_layout(xaxis_range=[0,30])
fig3.update_layout(yaxis_range=[0,10])
fig1.show()
fig2.show()
fig3.show()
Interpreting the barplot that follows, we notice that all channels are relatively close in terms of number of shares. However, the Lifestyle, Tech and Social Media channels seem to be slightly dominant.
If you want the most of your articles you may talk about these 3 topics.
# Average shares per article ("shares_nb") for each channel.
df = pd.DataFrame()
df['chanel'] = v_news["Chanel"]
df['shares'] = v_news['shares']
df2 = df.groupby('chanel').count().reset_index()  # article count per channel
df = df.groupby('chanel').sum().reset_index()     # total shares per channel
df['count'] = df2['shares']
df['shares_nb'] = df['shares']/df['count']        # sum/count == mean shares
# Drops the row at positional index 3 — presumably an unwanted channel
# category; TODO confirm which channel this removes.
df = df.drop(labels=3, axis=0)
fig = px.bar(df, x='chanel', y='shares_nb', color = 'chanel', color_discrete_sequence=Magma6)
fig.show()
🔍 Given the study on the title's length of articles we infer that the ideal title should either:
# Average shares per article as a function of title length (in tokens).
# The original built the mean by hand (separate count and sum groupbys,
# then a division); a single named-aggregation groupby is equivalent:
# shares_nb == sum(shares)/count(shares) == mean per group.
df = (v_news.groupby('n_tokens_title', as_index=False)['shares']
      .agg(shares='sum', count='count')
      .rename(columns={'n_tokens_title': 'lentitle'}))
df['shares_nb'] = df['shares'] / df['count']
fig = px.bar(df, x='lentitle', y='shares_nb', title="Shares with respect to the Bar plot of the length of Titles")
fig.update_traces(marker_color=Magma7[2])
fig.show()
In this section, we find out that the optimal length is between 1000 and 3000 words long. Most successful articles fall in this category and the amount of shares seems to follow through !
🔍 Considering that a line is 9 to 12 words long, you can't go wrong if you publish an article between 100 and 300 lines.
📚 Source : Length of a line
# Average shares per article as a function of body length (n_tokens_content).
df = pd.DataFrame()
df['shares'] = v_news['shares']
df['lencorpus'] = v_news['n_tokens_content']
df2 = df.groupby('lencorpus').count().reset_index()  # article count per length
df = df.groupby('lencorpus').sum().reset_index()     # total shares per length
df['count'] = df2['shares']
df['shares_nb'] = df['shares']/df['count']           # mean shares per article
fig = px.scatter(df, x='lencorpus', y='shares_nb',trendline='ols', title="Scatter plot of the length of the body with respect to shares")
fig.update_traces(marker_color=Magma7[2])
fig.show()
# Same relationship on the raw rows, coloured per channel.
fig = px.scatter(v_news, x='n_tokens_content', y='shares',color = 'Chanel',trendline='ols', color_discrete_sequence=Magma7, title="Scatter plot of shares with respect to the chanel and the amount of words in the article's body")
fig.show()
Here, we are trying to find out the optimal average length of your article's content.
🔍 After studying the scatter plot below, we can conclude that the optimal word length is 4 to 5 characters.
❗In this part, we suppose that the length of a word is correlated to its complexity. Consequently, we may analyze the result of the subsection through the complexity aspect.
# Average shares as a function of mean word length (used here as a proxy
# for word complexity).
df = pd.DataFrame()
df['shares'] = v_news['shares']
df['lenwords'] = v_news['average_token_length']
df2 = df.groupby('lenwords').count().reset_index()  # article count per value
df = df.groupby('lenwords').sum().reset_index()     # total shares per value
df['count'] = df2['shares']
df['shares_nb'] = df['shares']/df['count']          # mean shares per article
fig = px.scatter(df, x='lenwords', y='shares_nb',trendline='ols', title="Complexity of words with respect to shares")
fig.update_traces(marker_color=Magma7[2])
fig.show()
# Same relationship split by channel.
fig = px.scatter(v_news, x='average_token_length', y='shares',color = 'Chanel',trendline='ols', color_discrete_sequence=Magma7,
title="Complexity of words with respect to chanels and shares")
fig.show()
The following section aims at predicting the success of an article in the most precise way possible. To do so, we will use multiple types of prediction algorithms. We will use 2 class of algorithms, Machine Learning and Deep Learning ones. Amongst them, We find classification and regression algorithms that we will both use as well. In addition, we will use the Auto-Sklearn library to maximize our result.
⚙ Auto-sklearn is a program that tries multiple algorithms from the sk-learn library. You input the amount of time that you allow the AutoML to run for, as well as the type of algorithm that you desire (Regression or Classification). In the end, the program outputs a list of its best found Machine Learning models with the best found hyperparameters for each one.
⚡Each algorithm's functioning is detailed in its own subpart.
I. Machine Learning
II. Deep Learning
First, we separate the explanatory variables from the target variable.
# Regression setup: predict the raw "shares" count, optionally running an
# auto-sklearn search (disabled by default — it takes up to an hour).
search = False  # flip to True to actually run the AutoML search below
lm_data = m_news.copy()
# Drop the classification targets; keep only the regression target "shares".
lm_data.drop(columns=['Class_shares2', 'Class_shares1'], inplace=True)
x = lm_data.loc[:, lm_data.columns != "shares"]
y = lm_data.loc[:, lm_data.columns == "shares"].values.ravel()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# Scale features; fit on the training split only to avoid test leakage.
# (Renamed from `object`, which shadowed the builtin.)
scaler = RobustScaler()
x_train_scale = scaler.fit_transform(x_train)
x_test_scale = scaler.transform(x_test)
automl = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=3600,   # total search budget (seconds)
    per_run_time_limit=600,         # budget per candidate model (seconds)
    memory_limit=35000,
)
if search:
    automl.fit(x_train_scale, y_train)
    automl.leaderboard()
    automl.score(x_test_scale, y_test)
    automl.show_models()
🧠 ARD Regression is a Machine Learning model derived from Bayesian Ridge Regression.
# ARD (Automatic Relevance Determination) regression — a sparse Bayesian
# linear model — with hyperparameters taken from the auto-sklearn search.
model = ARDRegression(alpha_1=2.7664515192592053e-05, alpha_2=9.504988116581138e-07,
copy_X=False, lambda_1=4.184987756432487e-09,
lambda_2=4.238533890074848e-07,
threshold_lambda=78251.58542976103, tol=0.0006951835906397672)
model.fit(x_train_scale,y_train)
# R^2 on the held-out test split.
model.score(x_test_scale,y_test)
0.11736464223434451
# 5-fold cross-validation on the training split.
cross_validate(model, x_train_scale, np.ravel(y_train), cv = 5)
# NOTE(review): `cv` is created but never passed to LearningCurve below —
# confirm whether it was meant to be the visualizer's cv argument.
cv = StratifiedKFold(n_splits=5)
visualizer = LearningCurve(model, scoring='r2')  # yellowbrick learning-curve plot
visualizer.fit(x_train_scale, np.ravel(y_train)) # Fit the data to the visualizer
visualizer.show()
# Binary-classification setup: predict Class_shares1 ('high' vs. not),
# plus the comparison table that each classifier fills in later.
columns = ['Name', 'Accuracy', 'Precision', 'Recall', 'F1']
result = pd.DataFrame(columns=columns)
class_data = m_news.copy()
# Drop the regression target and the 4-level class label.
class_data.drop(columns=['shares', 'Class_shares2'], inplace=True)
x = class_data.loc[:, class_data.columns != "Class_shares1"]
y = class_data.loc[:, class_data.columns == "Class_shares1"].values.ravel()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# Binarise the target: 'high' -> 1, anything else -> 0.
y_train = np.where(y_train == 'high', 1, 0)
y_test = np.where(y_test == 'high', 1, 0)
# Renamed from `object`, which shadowed the builtin.
scaler = RobustScaler()
x_train_scale = scaler.fit_transform(x_train)
x_test_scale = scaler.transform(x_test)
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=30000, per_run_time_limit=3600, memory_limit=25000)
automl.fit(x_train_scale, y_train)
automl.leaderboard()
automl.score(x_test_scale, y_test)
🧠 Binning is the segmentation and convertion to numerical value of a set of data, that is what we do when we create a histogram. This same binning concept is applied to the Decision Tree (DT) algorithm. By reducing the number of features, it will be used to increase the algorithm’s speed. As a result, the same notion is employed in DT by grouping with histograms, which is known as the HGB classifier.
📚 Source : AnalyticsVidhya
Tokenization is skipped for long lines for performance reasons. This can be configured via editor.maxTokenizationLineLength.
# Baseline HGB classifier with default hyperparameters.
model = HistGradientBoostingClassifier()
model.fit(x_train_scale, y_train)
# Test accuracy (observed: 0.6671159029649596).
model.score(x_test_scale, y_test)
# Predict once for the confusion matrix.  The original computed an extra
# `pred = model.predict(...)` that was never used; removed.
y_pred = model.predict(x_test_scale)
conf = confusion_matrix(y_test, y_pred)
# HGB classifier, first tuned configuration (hyperparameters from auto-sklearn).
model = HistGradientBoostingClassifier(early_stopping=True, l2_regularization=1e-10,
learning_rate=0.13046552565826827, max_iter=64,
max_leaf_nodes=41, min_samples_leaf=40,
n_iter_no_change=6, random_state=1,
warm_start=True)
model.fit(x_train_scale, y_train)
# Test accuracy.
model.score(x_test_scale,y_test)
0.6725067385444744
# Second tuned configuration, for comparison.
model = HistGradientBoostingClassifier(early_stopping=True,
l2_regularization=3.609412172481434e-10,
learning_rate=0.05972079854295879, max_iter=512,
max_leaf_nodes=4, min_samples_leaf=2,
n_iter_no_change=14, random_state=1,
validation_fraction=None, warm_start=True)
model.fit(x_train_scale, y_train)
model.score(x_test_scale,y_test)
0.6701108116202456
# Grid search over HGB hyperparameters (10-fold CV, accuracy as the refit metric).
parameters_hgb = [{'max_iter': [1000,1200,1500],
'learning_rate': [0.1],
'max_depth' : [25, 50, 75],
'l2_regularization': [1.5],
'scoring': ['f1_micro']}]
grid_search_hgb = GridSearchCV(estimator = HistGradientBoostingClassifier(),
param_grid = parameters_hgb,
scoring = 'accuracy',
cv = 10,
n_jobs = -1)
grid_search_hgb.fit(x_train_scale, y_train)
# Best cross-validated accuracy and the winning hyperparameters.
grid_search_hgb.best_score_
0.6631349198002684
grid_search_hgb.best_params_
{'l2_regularization': 1.5,
'learning_rate': 0.1,
'max_depth': 75,
'max_iter': 1200,
'scoring': 'f1_micro'}
y_pred = grid_search_hgb.predict(x_test_scale)
# Test accuracy of the grid-searched HGB model.
accuracy_score(y_test,y_pred)
0.6719077568134172
# Record the HGB metrics in the comparison table.
result.loc[0] = ['HistGradientBoostingClassifier',accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred)]
# Confusion-matrix heatmap (conf was computed from the default HGB model above).
fig = px.imshow(conf,color_continuous_scale='RdBu_r')
fig.show()
# Retrain with the best grid-search parameters plus early stopping.
model = HistGradientBoostingClassifier(early_stopping=True,
l2_regularization=1.5,
learning_rate=0.1, max_iter=1000,
max_leaf_nodes=4, min_samples_leaf=2,
n_iter_no_change=14, random_state=1,
validation_fraction=None, warm_start=True)
model.fit(x_train_scale, y_train)
model.score(x_test_scale,y_test)
0.6737047020065888
# Learning curve for the tuned HGB model: accuracy vs. training-set size,
# averaged over 5 CV folds, with a +/- one-standard-deviation band.
sizes, fold_train_scores, fold_test_scores = learning_curve(
    model, x_train_scale, y_train, cv=5, scoring='accuracy',
    n_jobs=-1, train_sizes=np.linspace(0.01, 1.0, 10))
mean_train = np.mean(fold_train_scores, axis=1)
std_train = np.std(fold_train_scores, axis=1)
mean_test = np.mean(fold_test_scores, axis=1)
std_test = np.std(fold_test_scores, axis=1)
plt.subplots(1, figsize=(10, 10))
plt.plot(sizes, mean_train, '--', color="#111111", label="Training score")
plt.plot(sizes, mean_test, color="#111111", label="Cross-validation score")
plt.fill_between(sizes, mean_train - std_train, mean_train + std_train, color="#DDDDDD")
plt.fill_between(sizes, mean_test - std_test, mean_test + std_test, color="#DDDDDD")
plt.title("Learning Curve")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.legend(loc="best")
plt.tight_layout()
plt.show()
🧠 AdaBoost, short for Adaptive Boosting, is a statistical classification meta-algorithm formulated by Yoav Freund and Robert Schapire in 1995, who won the 2003 Gödel Prize for their work. Usually, AdaBoost is presented for binary classification, although it can be generalized to multiple classes or bounded intervals on the real line.
AdaBoost is adaptive in the sense that subsequent weak learners are tweaked in favor of those instances misclassified by previous classifiers. In some problems it can be less susceptible to the overfitting problem than other learning algorithms. The individual learners can be weak, but as long as the performance of each one is slightly better than random guessing, the final model can be proven to converge to a strong learner.
📚 Source : AdaBoost Wikipedia
Tokenization is skipped for long lines for performance reasons. This can be configured via editor.maxTokenizationLineLength.
# Baseline AdaBoost with default settings.
model = AdaBoostClassifier()
model.fit(x_train_scale, y_train)
model.score(x_test_scale, y_test)
0.6602276130578018
# Tuned AdaBoost: SAMME with a shallow decision-tree base learner.
model = AdaBoostClassifier(algorithm='SAMME',
base_estimator=DecisionTreeClassifier(max_depth=2),
learning_rate=0.03734246906377268, n_estimators=416,
random_state=1)
model.fit(x_train_scale, y_train)
model.score(x_test_scale, y_test)
0.655884995507637
y_pred = model.predict(x_test_scale)
accuracy_score(y_test,y_pred)
0.655884995507637
# Grid search over the base tree's split settings and the ensemble size.
param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
"base_estimator__splitter" : ["best", "random"],
"n_estimators": [200, 416],
"algorithm" : ['SAMME']
}
DTC = DecisionTreeClassifier(random_state = 11, max_features = "auto", class_weight = "balanced",max_depth = 4)
ABC = AdaBoostClassifier(base_estimator = DTC)
# run grid search
grid_search_ABC = GridSearchCV(ABC, param_grid=param_grid, scoring = 'roc_auc')
grid_search_ABC.fit(x_train_scale, y_train)
grid_search_ABC.best_params_
{'algorithm': 'SAMME',
'base_estimator__criterion': 'gini',
'base_estimator__splitter': 'best',
'n_estimators': 416}
grid_search_ABC.score(x_test_scale, y_test)
0.7072761772468598
y_pred = grid_search_ABC.predict(x_test_scale)
# NOTE(review): `conf` was last assigned from the HGB predictions, so this
# heatmap does NOT reflect the AdaBoost predictions above — confirm intent.
fig = px.imshow(conf,color_continuous_scale='RdBu_r')
fig.show()
# Record AdaBoost metrics in the comparison table.
result.loc[1] = ['AdaBoostClassifier',accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred)]
# Learning curve (5-fold CV accuracy vs. training size) for the tuned AdaBoost.
train_sizes, train_scores, test_scores = learning_curve(model, x_train_scale, y_train, cv=5, scoring='accuracy', n_jobs=-1, train_sizes=np.linspace(0.01, 1.0, 5))
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.subplots(1, figsize=(10,10))
plt.plot(train_sizes, train_mean, '--', color="#111111", label="Training score")
plt.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="#DDDDDD")
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="#DDDDDD")
plt.title("Learning Curve")
plt.xlabel("Training Set Size"), plt.ylabel("Accuracy Score"), plt.legend(loc="best")
plt.tight_layout()
plt.show()
Tokenization is skipped for long lines for performance reasons. This can be configured via editor.maxTokenizationLineLength.
🧠 LightGBM, short for light gradient-boosting machine, is a free and open-source distributed gradient-boosting framework for machine learning, originally developed by Microsoft. It is based on decision tree algorithms and used for ranking, classification and other machine learning tasks. The development focus is on performance and scalability.
📚 Source : Wikipedia LightGBM
import lightgbm as lgb
# LightGBM classifier with manually chosen hyperparameters.
lgbm=lgb.LGBMClassifier(n_estimators= 100, boosting_type= 'gbdt', colsample_bytree= 0.8, learning_rate= 0.09, max_depth=30).fit(x_train_scale, y_train)
y_pred = lgbm.predict(x_test_scale)
# Test accuracy, computed two equivalent ways.
accuracy_score(y_test,y_pred)
0.6722072476789458
lgbm.score(x_test_scale,y_test)
0.6722072476789458
# Record LightGBM metrics in the comparison table.
result.loc[4] = ['lightgbm',accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred)]
Tokenization is skipped for long lines for performance reasons. This can be configured via editor.maxTokenizationLineLength.
XgBoost stands for Extreme Gradient Boosting, which was proposed by the researchers at the University of Washington. It is a library written in C++ which optimizes the training for Gradient Boosting.
🧠 In this algorithm, decision trees are created in sequential form. Weights play an important role in XGBoost. Weights are assigned to all the independent variables which are then fed into the decision tree which predicts results. The weight of variables predicted wrong by the tree is increased and these variables are then fed to the second decision tree. These individual classifiers/predictors then ensemble to give a strong and more precise model. It can work on regression, classification, ranking, and user-defined prediction problems.
📚 Source : GeeksForGeeks
from xgboost import XGBClassifier
# fit model no training data
# Baseline XGBoost with default hyperparameters.
model = XGBClassifier()
model.fit(x_train_scale, y_train)
XGBClassifier()
0.6689128481581311
XGBClassifier(base_score=0.46862363336827917, colsample_bytree=0.9,
early_stopping_rouds=50, eta=0.08, eval_metric='auc',
max_depth=20, seed=666, subsample=0.9)
y_pred = model.predict(x_test_scale)
# Test accuracy of the manually configured XGBoost model.
accuracy_score(y_test,y_pred)
0.6572327044025157
# Grid search over depth / ensemble size / learning rate (10-fold CV, ROC AUC).
# NOTE(review): grid_search is defined but never fitted in this chunk — confirm.
estimator = XGBClassifier(
objective= 'binary:logistic',
nthread=4,
seed=42
)
parameters = {
'max_depth': range (2, 10, 1),
'n_estimators': range(60, 220, 40),
'learning_rate': [0.1, 0.01, 0.05]
}
grid_search = GridSearchCV(
estimator=estimator,
param_grid=parameters,
scoring = 'roc_auc',
n_jobs = 10,
cv = 10,
verbose=True
)
🧠 Random Forest is a bagging algorithm that has the specificity of taking a subset of the explanatory variables for each tree that it builds, unlike classical bagging which takes all of them. This prevents the different learning trees from being correlated.
📚 Source : Random Forest Wikipedia
# Baseline random forest with default settings.
model = RandomForestClassifier()
model.fit(x_train_scale, y_train)
model.score(x_test_scale, y_test)
0.6587301587301587
# Tuned forest #1 (hyperparameters from auto-sklearn).
model = RandomForestClassifier(bootstrap=False, max_features=15, min_samples_leaf=2,
min_samples_split=15, n_estimators=512, n_jobs=1,
random_state=1, warm_start=True)
model.fit(x_train_scale, y_train)
model.score(x_test_scale, y_test)
0.6638215034441449
# Tuned forest #2 — the configuration used for the reported metrics below.
model = RandomForestClassifier(max_features=7, n_estimators=512, n_jobs=1,
random_state=1, warm_start=True)
model.fit(x_train_scale, y_train)
model.score(x_test_scale, y_test)
0.6660676849356094
y_pred = model.predict(x_test_scale)
# Record random-forest metrics in the comparison table.
result.loc[2] = ['RandomForestClassifier',accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred)]
# Grid defined for a further search.
# NOTE(review): grid_search_RF is never fitted in this chunk — confirm.
par = {'bootstrap': [True],
'max_depth': [10, 20, 30, 40],
'max_features': ['auto', 'sqrt'],
'min_samples_leaf': [1, 2],
'min_samples_split': [2],
'n_estimators': [200, 400, 600]}
grid_search_RF = GridSearchCV(RandomForestClassifier(), param_grid=par, scoring = 'roc_auc')
🧠 KNN as a classification algorithm determines the class of an unknown individual based on the classes of its K nearest neighbors.
# KNN classifier with K = 80 neighbours.
neigh = KNeighborsClassifier(n_neighbors=80)
neigh.fit(x_train_scale, y_train)
KNeighborsClassifier(n_neighbors=80)
y_pred = neigh.predict(x_test_scale)
# Test accuracy.
accuracy_score(y_test,y_pred)
0.6337226714585206
# Record KNN metrics in the comparison table.
result.loc[3] = ['KNeighborsClassifier',accuracy_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred),f1_score(y_test,y_pred)]
# Sweep K from 1 to 99 and record the test accuracy (in %) for each setting.
k_range = np.arange(1, 100)

def _knn_test_accuracy(k):
    # Fit a fresh KNN on the scaled training split and score it on the test split.
    clf = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
    clf.fit(x_train_scale, y_train)
    return 100 * accuracy_score(clf.predict(x_test_scale), y_test)

accuracy = [_knn_test_accuracy(k) for k in k_range]
# Accuracy-vs-K curve, with a marker at every K.
plt.figure(figsize=(20, 13))
plt.plot(k_range, accuracy, 'r-', label='KNN Accuracy Vs KNN Neighbors size')
plt.plot(k_range, accuracy, 'bx')
plt.xlabel('KNN Neighbors size')
plt.ylabel('KNN Accuracy')
plt.legend()
plt.grid()
plt.title('KNN Accuracy Vs Neighbors size')
plt.show()
# Display and persist the model-comparison table.
result
result.to_csv("result.csv")
# Grouped bar chart comparing all models on the four metrics.
fig = go.Figure(data=[
go.Bar(name='Accuracy', x=result['Name'], y=result['Accuracy']),
go.Bar(name='Precision', x=result['Name'], y=result['Precision']),
go.Bar(name='Recall', x=result['Name'], y=result['Recall']),
go.Bar(name='F1', x=result['Name'], y=result['F1']),
])
# Change the bar mode
fig.update_layout(barmode='group')
fig.show()
# Multiclass setup: predict the 4-level Class_shares2 label.
class_data = m_news.copy()
class_data.drop(columns=['shares', 'Class_shares1'], inplace=True)
# Encode the string class labels as integers.
le = preprocessing.LabelEncoder()
class_data['Class_shares2'] = le.fit_transform(class_data['Class_shares2'])
x = class_data.loc[:, class_data.columns != "Class_shares2"]
y = class_data.loc[:, class_data.columns == "Class_shares2"].values.ravel()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# Renamed from `object`, which shadowed the builtin.
scaler = RobustScaler()
x_train_scale = scaler.fit_transform(x_train)
x_test_scale = scaler.transform(x_test)
🧠 If you want information about the functioning of this algorithm, please check the Binary Classification section in which it is explained.
# 4-class HGB classifier (hyperparameters from auto-sklearn).
model = HistGradientBoostingClassifier(early_stopping=True,
l2_regularization=2.506856350040198e-06,
learning_rate=0.04634380160611007, max_iter=512,
max_leaf_nodes=11, min_samples_leaf=41,
n_iter_no_change=17, random_state=1,
validation_fraction=None, warm_start=True)
model.fit(x_train_scale,y_train)
# Test accuracy over the 4 classes.
model.score(x_test_scale,y_test)
0.4002695417789757
# 4x4 confusion-matrix heatmap.
y_pred = model.predict(x_test_scale)
conf = confusion_matrix(y_test, y_pred)
fig = px.imshow(conf,color_continuous_scale='RdBu_r')
fig.show()
🧠 If you want information about the functioning of this algorithm, please check the Binary Classification section in which it is explained.
# 4-class random forest (hyperparameters from auto-sklearn).
model = RandomForestClassifier(criterion='entropy', max_features=11,
min_samples_leaf=17, min_samples_split=5,
n_estimators=512, n_jobs=1, random_state=1,
warm_start=True)
model.fit(x_train_scale,y_train)
# Test accuracy over the 4 classes.
model.score(x_test_scale,y_test)
0.40236597783767597
# 4x4 confusion-matrix heatmap.
y_pred = model.predict(x_test_scale)
conf = confusion_matrix(y_test, y_pred)
fig = px.imshow(conf,color_continuous_scale='RdBu_r')
fig.show()
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.38 0.49 0.43 1857
1 0.36 0.28 0.32 1987
2 0.33 0.09 0.14 948
3 0.46 0.61 0.52 1886
accuracy 0.41 6678
macro avg 0.39 0.37 0.35 6678
weighted avg 0.39 0.41 0.38 6678
# Unsupervised view: project the full (scaled) feature matrix to 2-D with PCA.
class_data = m_news.copy()
del class_data['shares']
del class_data['Class_shares1']
x = class_data.loc[:, class_data.columns != "Class_shares2"]
y = class_data.loc[:, class_data.columns == "Class_shares2"]
# Renamed from `object`, which shadowed the builtin.
scaler = StandardScaler()
x_scale = scaler.fit_transform(x)
pca = PCA(n_components=2)
# BUG FIX: the original fitted PCA on the stale `x_train_scale` left over from
# the classification section, leaving the freshly computed `x_scale` unused.
reduced_data = pca.fit_transform(x_scale)
reduced_data.shape
plt.figure(figsize=(10,10))
plt.plot(reduced_data[:,0], reduced_data[:,1], 'r.')
plt.title('PCA Transformation')
plt.show()
# 2-D t-SNE embedding.
# NOTE(review): fitted on `x_train_scale` from the classification section, not
# on the freshly standardised `x_scale` above — confirm which was intended.
tsne = TSNE(n_components=2, n_iter=300)
reduced_tsne = tsne.fit_transform(x_train_scale)
# plotting the clusters TSNE
plt.figure(figsize=(10,10))
plt.plot(reduced_tsne[:,0], reduced_tsne[:,1], 'r.')
plt.title('TSNE Transformation')
plt.show()
🧠 k-means clustering is a method of vector quantization, originally from signal processing, that aims to partition n observations into k clusters in which each observation belongs to the cluster with the nearest mean (cluster centers or cluster centroid), serving as a prototype of the cluster.
📚 Source : K-means Wikipedia
# Elbow method: K-Means inertia (sum of squared distances) for k = 1..8
# on the t-SNE embedding, to pick a cluster count.
k=list(range(1,9))
ssd=[]
for i in k:
kmeans=KMeans(n_clusters=i).fit(reduced_tsne)
ssd.append(kmeans.inertia_)
plt.plot(k,ssd,'o-')
plt.xlabel('k')
plt.ylabel('Sum of squared error')
plt.show()
# Final K-Means (k=5) on the t-SNE embedding, plotted with one colour per cluster.
kmeans = KMeans(init='k-means++', n_clusters=5)
kmeans.fit(reduced_tsne)
kmeans_preds = kmeans.predict(reduced_tsne)
# Map each cluster id to a plotting colour.  (The original built `col` with a
# chain of ifs whose loop variable `x` shadowed the feature matrix.)
_CLUSTER_COLORS = {0: 'black', 1: 'blue', 2: 'red', 3: 'orange', 4: 'green'}
col = [_CLUSTER_COLORS[label] for label in kmeans_preds]
centroids = kmeans.cluster_centers_
clusters = np.unique(kmeans_preds)
# ploting the result of of the clusters
# BUG FIX: plt.subplots returns (figure, axes); the original unpacked them in
# the wrong order as `ax, fig`.
fig, ax = plt.subplots(figsize=(15, 7))
plt.scatter(reduced_tsne[:,0], reduced_tsne[:,1], c=col, marker='.',)
plt.show()
In this section dedicated to Deep Learning, we will use Keras Sequential neural networks.
⚙ How do they work ?
💭 Why do we use it ?
Tokenization is skipped for long lines for performance reasons. This can be configured via editor.maxTokenizationLineLength.
# Deep-learning regression setup: features scaled to [0, 1], target scaled
# robustly (share counts have heavy outliers).
reg_dl_news = m_news.copy()
reg_dl_news.drop(columns=["Class_shares1", "Class_shares2"], inplace=True)
x = reg_dl_news.loc[:, reg_dl_news.columns != "shares"]
y = reg_dl_news.loc[:, reg_dl_news.columns == "shares"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# Fit the feature scaler on the training split only to avoid test leakage.
# (Renamed from `object`, which shadowed the builtin.)
feature_scaler = preprocessing.MinMaxScaler()
x_train = feature_scaler.fit_transform(x_train)
x_test = feature_scaler.transform(x_test)
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)
# Keep the target scaler so predictions can be inverse-transformed later.
scaler_target = preprocessing.RobustScaler()
y_train_scaled = scaler_target.fit_transform(y_train.reshape(-1, 1))
y_test_scaled = scaler_target.transform(y_test.reshape(-1, 1))
First Model :
# Regression network #1: 63 -> 24 -> 32 -> 1, with dropout between layers.
model = Sequential()
model.add(Dense(24, activation='relu', input_shape=(63,)))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(loss='mse',
metrics=['mae'],
optimizer = optimizers.RMSprop(0.01))
# Train on the SCALED target; validation on the held-out split.
history = model.fit(x_train, y_train_scaled, validation_data=(x_test, y_test_scaled), epochs=30, batch_size=200)
Epoch 1/30 134/134 [==============================] - 1s 3ms/step - loss: 0.7790 - mae: 0.6675 - val_loss: 0.7127 - val_mae: 0.6427 Epoch 2/30 134/134 [==============================] - 0s 3ms/step - loss: 0.7366 - mae: 0.6468 - val_loss: 0.7079 - val_mae: 0.6423 Epoch 3/30 134/134 [==============================] - 0s 3ms/step - loss: 0.7312 - mae: 0.6445 - val_loss: 0.7134 - val_mae: 0.6637 Epoch 4/30 134/134 [==============================] - 0s 3ms/step - loss: 0.7268 - mae: 0.6426 - val_loss: 0.7257 - val_mae: 0.6527 Epoch 5/30 134/134 [==============================] - 0s 3ms/step - loss: 0.7250 - mae: 0.6413 - val_loss: 0.7041 - val_mae: 0.6279 Epoch 6/30 134/134 [==============================] - 0s 3ms/step - loss: 0.7223 - mae: 0.6396 - val_loss: 0.7047 - val_mae: 0.6112 Epoch 7/30 134/134 [==============================] - 0s 4ms/step - loss: 0.7203 - mae: 0.6381 - val_loss: 0.7054 - val_mae: 0.6369 Epoch 8/30 134/134 [==============================] - 0s 3ms/step - loss: 0.7209 - mae: 0.6390 - val_loss: 0.7040 - val_mae: 0.6393 Epoch 9/30 134/134 [==============================] - 1s 4ms/step - loss: 0.7185 - mae: 0.6388 - val_loss: 0.7142 - val_mae: 0.6350 Epoch 10/30 134/134 [==============================] - 1s 5ms/step - loss: 0.7199 - mae: 0.6382 - val_loss: 0.7170 - val_mae: 0.6378 Epoch 11/30 134/134 [==============================] - 1s 5ms/step - loss: 0.7174 - mae: 0.6374 - val_loss: 0.7097 - val_mae: 0.6664 Epoch 12/30 134/134 [==============================] - 1s 5ms/step - loss: 0.7189 - mae: 0.6387 - val_loss: 0.7022 - val_mae: 0.6465 Epoch 13/30 134/134 [==============================] - 1s 4ms/step - loss: 0.7162 - mae: 0.6362 - val_loss: 0.6949 - val_mae: 0.6323 Epoch 14/30 134/134 [==============================] - 0s 3ms/step - loss: 0.7158 - mae: 0.6369 - val_loss: 0.7084 - val_mae: 0.6257 Epoch 15/30 134/134 [==============================] - 0s 3ms/step - loss: 0.7144 - mae: 0.6352 - val_loss: 0.7054 - val_mae: 0.6073 Epoch 16/30 
134/134 [==============================] - 0s 3ms/step - loss: 0.7119 - mae: 0.6342 - val_loss: 0.6975 - val_mae: 0.6185 Epoch 17/30 134/134 [==============================] - 0s 3ms/step - loss: 0.7158 - mae: 0.6354 - val_loss: 0.7008 - val_mae: 0.6218 Epoch 18/30 134/134 [==============================] - 0s 3ms/step - loss: 0.7144 - mae: 0.6364 - val_loss: 0.6996 - val_mae: 0.6171 Epoch 19/30 134/134 [==============================] - 0s 3ms/step - loss: 0.7133 - mae: 0.6344 - val_loss: 0.7068 - val_mae: 0.6077 Epoch 20/30 134/134 [==============================] - 0s 3ms/step - loss: 0.7132 - mae: 0.6350 - val_loss: 0.6900 - val_mae: 0.6195 Epoch 21/30 134/134 [==============================] - 0s 3ms/step - loss: 0.7131 - mae: 0.6346 - val_loss: 0.6991 - val_mae: 0.6031 Epoch 22/30 134/134 [==============================] - 0s 2ms/step - loss: 0.7122 - mae: 0.6325 - val_loss: 0.7023 - val_mae: 0.6541 Epoch 23/30 134/134 [==============================] - 0s 3ms/step - loss: 0.7109 - mae: 0.6346 - val_loss: 0.6959 - val_mae: 0.6205 Epoch 24/30 134/134 [==============================] - 0s 3ms/step - loss: 0.7114 - mae: 0.6342 - val_loss: 0.6935 - val_mae: 0.6377 Epoch 25/30 134/134 [==============================] - 0s 3ms/step - loss: 0.7106 - mae: 0.6341 - val_loss: 0.7018 - val_mae: 0.6406 Epoch 26/30 134/134 [==============================] - 0s 2ms/step - loss: 0.7106 - mae: 0.6339 - val_loss: 0.6999 - val_mae: 0.6185 Epoch 27/30 134/134 [==============================] - 0s 3ms/step - loss: 0.7107 - mae: 0.6331 - val_loss: 0.7012 - val_mae: 0.6482 Epoch 28/30 134/134 [==============================] - 0s 3ms/step - loss: 0.7119 - mae: 0.6349 - val_loss: 0.6976 - val_mae: 0.6461 Epoch 29/30 134/134 [==============================] - 0s 2ms/step - loss: 0.7093 - mae: 0.6322 - val_loss: 0.6921 - val_mae: 0.6372 Epoch 30/30 134/134 [==============================] - 0s 3ms/step - loss: 0.7102 - mae: 0.6323 - val_loss: 0.7012 - val_mae: 0.6437
y_pred = model.predict(x_test)
# NOTE(review): evaluated against the UNSCALED y_test although the model was
# trained on y_train_scaled — which explains the huge MSE below; confirm intent.
score = model.evaluate(x_test, y_test, verbose=1)
print(score)
209/209 [==============================] - 0s 1ms/step 209/209 [==============================] - 0s 1ms/step - loss: 3819805.2500 - mae: 1644.9724 [3819805.25, 1644.972412109375]
train_metrics = model.evaluate(x_train, y_train, verbose=0)
test_metrics = model.evaluate(x_test, y_test, verbose=0)
print(f"MSE Train : {train_metrics[0]} || MSE Test : {test_metrics[0]}")
print(f"MAE Train : {train_metrics[1]} || MAE Test : {test_metrics[1]}")
MSE Train : 3859731.25 || MSE Test : 3819805.25 MAE Train : 1648.3172607421875 || MAE Test : 1644.972412109375
# Training vs. validation loss per epoch.
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
_=plt.legend()
plt.xlabel("no epochs")
plt.ylabel("value loss")
Second Model :
# Regression network #2: narrower first layer (63 -> 5 -> 25 -> 25 -> 1),
# lighter dropout and a smaller learning rate, for comparison with model 1.
model2 = Sequential()
model2.add(Dense(5, activation='relu', input_shape=(63,)))
model2.add(Dense(25, activation='relu'))
model2.add(Dropout(0.1))
model2.add(Dense(25, activation='relu'))
model2.add(Dense(1))
model2.compile(loss='mse',
metrics=['mae'],
optimizer = optimizers.RMSprop(0.003))
history2 = model2.fit(x_train, y_train_scaled, validation_data=(x_test, y_test_scaled), epochs=15, batch_size=200)
Epoch 1/15 134/134 [==============================] - 1s 4ms/step - loss: 0.7540 - mae: 0.6563 - val_loss: 0.7110 - val_mae: 0.6550 Epoch 2/15 134/134 [==============================] - 0s 3ms/step - loss: 0.7272 - mae: 0.6424 - val_loss: 0.7022 - val_mae: 0.6357 Epoch 3/15 134/134 [==============================] - 0s 3ms/step - loss: 0.7228 - mae: 0.6392 - val_loss: 0.6990 - val_mae: 0.6307 Epoch 4/15 134/134 [==============================] - 0s 3ms/step - loss: 0.7194 - mae: 0.6367 - val_loss: 0.7118 - val_mae: 0.6134 Epoch 5/15 134/134 [==============================] - 0s 3ms/step - loss: 0.7153 - mae: 0.6342 - val_loss: 0.6968 - val_mae: 0.6270 Epoch 6/15 134/134 [==============================] - 0s 3ms/step - loss: 0.7150 - mae: 0.6343 - val_loss: 0.6936 - val_mae: 0.6212 Epoch 7/15 134/134 [==============================] - 0s 3ms/step - loss: 0.7118 - mae: 0.6327 - val_loss: 0.6942 - val_mae: 0.6222 Epoch 8/15 134/134 [==============================] - 0s 3ms/step - loss: 0.7114 - mae: 0.6328 - val_loss: 0.6966 - val_mae: 0.6106 Epoch 9/15 134/134 [==============================] - 0s 2ms/step - loss: 0.7101 - mae: 0.6303 - val_loss: 0.6913 - val_mae: 0.6251 Epoch 10/15 134/134 [==============================] - 0s 2ms/step - loss: 0.7084 - mae: 0.6313 - val_loss: 0.6970 - val_mae: 0.6479 Epoch 11/15 134/134 [==============================] - 0s 2ms/step - loss: 0.7072 - mae: 0.6299 - val_loss: 0.6978 - val_mae: 0.6082 Epoch 12/15 134/134 [==============================] - 0s 3ms/step - loss: 0.7069 - mae: 0.6298 - val_loss: 0.6950 - val_mae: 0.6144 Epoch 13/15 134/134 [==============================] - 0s 2ms/step - loss: 0.7056 - mae: 0.6287 - val_loss: 0.6939 - val_mae: 0.6164 Epoch 14/15 134/134 [==============================] - 0s 3ms/step - loss: 0.7057 - mae: 0.6286 - val_loss: 0.6996 - val_mae: 0.6441 Epoch 15/15 134/134 [==============================] - 0s 3ms/step - loss: 0.7045 - mae: 0.6290 - val_loss: 0.6957 - val_mae: 0.6322
MSE Train : 3859946.0 || MSE Test : 3820015.0 MAE Train : 1648.377685546875 || MAE Test : 1645.032470703125
# Build the binary-classification dataset: drop the raw share counts and the
# multi-class label, keeping Class_shares1 as the target.
cla_dl_news = m_news.drop(columns=["shares", "Class_shares2"])
x = cla_dl_news.drop(columns=["Class_shares1"])
y = cla_dl_news["Class_shares1"].to_numpy()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)  # data is already scaled
# One-hot encode the string labels for the categorical-crossentropy loss.
ohe_dl = OneHotEncoder(handle_unknown='ignore')
y_train = ohe_dl.fit_transform(y_train.reshape(-1, 1)).toarray()
y_test = ohe_dl.transform(y_test.reshape(-1, 1)).toarray()
# Binary deep classifier over the scaled features; output is a 2-way one-hot.
model_reg1 = Sequential()
# Infer the input width from the data instead of hard-coding 63.
model_reg1.add(Dense(500, activation='relu', input_shape=(x_train.shape[1],)))
model_reg1.add(Dense(100, activation='relu'))
model_reg1.add(Dense(50, activation='relu'))
# FIX: categorical_crossentropy on one-hot targets expects a probability
# distribution over the classes — sigmoid on 2 units does not sum to 1;
# softmax is the correct output activation here.
model_reg1.add(Dense(2, activation='softmax'))
# Compile the model
model_reg1.compile(optimizer='adam',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])
reg1_history = model_reg1.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=60, batch_size=150)
Epoch 1/60 150/150 [==============================] - 2s 9ms/step - loss: 1374.1788 - accuracy: 0.5074 - val_loss: 1164.8440 - val_accuracy: 0.4699 Epoch 2/60 150/150 [==============================] - 1s 7ms/step - loss: 860.9224 - accuracy: 0.5180 - val_loss: 72.8253 - val_accuracy: 0.5575 Epoch 3/60 150/150 [==============================] - 1s 7ms/step - loss: 707.1960 - accuracy: 0.5110 - val_loss: 495.3256 - val_accuracy: 0.4706 Epoch 4/60 150/150 [==============================] - 1s 6ms/step - loss: 317.8339 - accuracy: 0.5144 - val_loss: 261.2305 - val_accuracy: 0.5299 Epoch 5/60 150/150 [==============================] - 1s 8ms/step - loss: 153.6503 - accuracy: 0.5068 - val_loss: 30.7872 - val_accuracy: 0.5654 Epoch 6/60 150/150 [==============================] - 1s 6ms/step - loss: 112.8508 - accuracy: 0.5162 - val_loss: 95.7697 - val_accuracy: 0.4716 Epoch 7/60 150/150 [==============================] - 1s 6ms/step - loss: 70.3102 - accuracy: 0.5160 - val_loss: 72.5418 - val_accuracy: 0.5370 Epoch 8/60 150/150 [==============================] - 1s 7ms/step - loss: 115.3843 - accuracy: 0.5192 - val_loss: 47.1550 - val_accuracy: 0.5401 Epoch 9/60 150/150 [==============================] - 1s 8ms/step - loss: 91.5014 - accuracy: 0.5188 - val_loss: 23.3417 - val_accuracy: 0.5222 Epoch 10/60 150/150 [==============================] - 1s 8ms/step - loss: 57.8234 - accuracy: 0.5189 - val_loss: 97.2591 - val_accuracy: 0.5344 Epoch 11/60 150/150 [==============================] - 1s 7ms/step - loss: 52.8480 - accuracy: 0.5230 - val_loss: 76.6300 - val_accuracy: 0.4902 Epoch 12/60 150/150 [==============================] - 1s 8ms/step - loss: 57.9529 - accuracy: 0.5224 - val_loss: 122.3198 - val_accuracy: 0.4719 Epoch 13/60 150/150 [==============================] - 1s 7ms/step - loss: 46.5792 - accuracy: 0.5199 - val_loss: 76.7960 - val_accuracy: 0.4712 Epoch 14/60 150/150 [==============================] - 1s 6ms/step - loss: 54.9994 - accuracy: 0.5172 - 
val_loss: 51.8316 - val_accuracy: 0.5400 Epoch 15/60 150/150 [==============================] - 1s 7ms/step - loss: 36.3061 - accuracy: 0.5261 - val_loss: 30.2682 - val_accuracy: 0.4825 Epoch 16/60 150/150 [==============================] - 1s 7ms/step - loss: 44.0931 - accuracy: 0.5210 - val_loss: 21.8375 - val_accuracy: 0.5513 Epoch 17/60 150/150 [==============================] - 1s 7ms/step - loss: 28.4663 - accuracy: 0.5219 - val_loss: 92.8242 - val_accuracy: 0.5300 Epoch 18/60 150/150 [==============================] - 1s 6ms/step - loss: 21.2641 - accuracy: 0.5267 - val_loss: 10.2366 - val_accuracy: 0.5707 Epoch 19/60 150/150 [==============================] - 1s 7ms/step - loss: 17.1555 - accuracy: 0.5290 - val_loss: 9.0999 - val_accuracy: 0.5419 Epoch 20/60 150/150 [==============================] - 1s 7ms/step - loss: 13.4933 - accuracy: 0.5335 - val_loss: 9.1912 - val_accuracy: 0.4834 Epoch 21/60 150/150 [==============================] - 1s 6ms/step - loss: 12.4037 - accuracy: 0.5211 - val_loss: 25.7567 - val_accuracy: 0.4697 Epoch 22/60 150/150 [==============================] - 1s 7ms/step - loss: 10.9389 - accuracy: 0.5177 - val_loss: 25.9628 - val_accuracy: 0.5242 Epoch 23/60 150/150 [==============================] - 1s 7ms/step - loss: 10.3263 - accuracy: 0.5177 - val_loss: 7.1340 - val_accuracy: 0.5399 Epoch 24/60 150/150 [==============================] - 1s 7ms/step - loss: 3.1374 - accuracy: 0.5367 - val_loss: 4.2200 - val_accuracy: 0.5384 Epoch 25/60 150/150 [==============================] - 1s 7ms/step - loss: 2.9842 - accuracy: 0.5414 - val_loss: 1.9322 - val_accuracy: 0.5702 Epoch 26/60 150/150 [==============================] - 1s 7ms/step - loss: 2.8577 - accuracy: 0.5307 - val_loss: 1.9819 - val_accuracy: 0.5724 Epoch 27/60 150/150 [==============================] - 1s 7ms/step - loss: 183.6258 - accuracy: 0.5146 - val_loss: 4.9934 - val_accuracy: 0.5428 Epoch 28/60 150/150 [==============================] - 1s 7ms/step - loss: 21.2318 
- accuracy: 0.5000 - val_loss: 7.9930 - val_accuracy: 0.4654 Epoch 29/60 150/150 [==============================] - 1s 7ms/step - loss: 9.0180 - accuracy: 0.5070 - val_loss: 2.0575 - val_accuracy: 0.4926 Epoch 30/60 150/150 [==============================] - 1s 7ms/step - loss: 1.5628 - accuracy: 0.5167 - val_loss: 1.4754 - val_accuracy: 0.5312 Epoch 31/60 150/150 [==============================] - 1s 7ms/step - loss: 1.3090 - accuracy: 0.5212 - val_loss: 0.9409 - val_accuracy: 0.5348 Epoch 32/60 150/150 [==============================] - 1s 7ms/step - loss: 0.7823 - accuracy: 0.5394 - val_loss: 0.7571 - val_accuracy: 0.5383 Epoch 33/60 150/150 [==============================] - 1s 7ms/step - loss: 0.7368 - accuracy: 0.5417 - val_loss: 0.7681 - val_accuracy: 0.4799 Epoch 34/60 150/150 [==============================] - 1s 7ms/step - loss: 0.7284 - accuracy: 0.5363 - val_loss: 0.6944 - val_accuracy: 0.5747 Epoch 35/60 150/150 [==============================] - 1s 7ms/step - loss: 0.7235 - accuracy: 0.5442 - val_loss: 0.7784 - val_accuracy: 0.5306 Epoch 36/60 150/150 [==============================] - 1s 7ms/step - loss: 0.7076 - accuracy: 0.5448 - val_loss: 0.7746 - val_accuracy: 0.5300 Epoch 37/60 150/150 [==============================] - 1s 7ms/step - loss: 0.7122 - accuracy: 0.5486 - val_loss: 0.6874 - val_accuracy: 0.5750 Epoch 38/60 150/150 [==============================] - 1s 6ms/step - loss: 0.7159 - accuracy: 0.5432 - val_loss: 0.6943 - val_accuracy: 0.5771 Epoch 39/60 150/150 [==============================] - 1s 7ms/step - loss: 0.7076 - accuracy: 0.5466 - val_loss: 0.6943 - val_accuracy: 0.5724 Epoch 40/60 150/150 [==============================] - 1s 7ms/step - loss: 0.7114 - accuracy: 0.5448 - val_loss: 0.7520 - val_accuracy: 0.4729 Epoch 41/60 150/150 [==============================] - 1s 7ms/step - loss: 0.7041 - accuracy: 0.5446 - val_loss: 0.7024 - val_accuracy: 0.5602 Epoch 42/60 150/150 [==============================] - 1s 6ms/step - loss: 
0.7028 - accuracy: 0.5498 - val_loss: 0.7317 - val_accuracy: 0.5324 Epoch 43/60 150/150 [==============================] - 1s 6ms/step - loss: 38.3143 - accuracy: 0.5407 - val_loss: 250.1640 - val_accuracy: 0.5289 Epoch 44/60 150/150 [==============================] - 1s 6ms/step - loss: 62.3138 - accuracy: 0.5005 - val_loss: 5.5336 - val_accuracy: 0.5290 Epoch 45/60 150/150 [==============================] - 1s 6ms/step - loss: 1.1186 - accuracy: 0.5058 - val_loss: 0.7390 - val_accuracy: 0.4702 Epoch 46/60 150/150 [==============================] - 1s 6ms/step - loss: 0.7203 - accuracy: 0.5152 - val_loss: 0.8269 - val_accuracy: 0.5306 Epoch 47/60 150/150 [==============================] - 1s 6ms/step - loss: 0.7363 - accuracy: 0.5090 - val_loss: 0.9496 - val_accuracy: 0.4710 Epoch 48/60 150/150 [==============================] - 1s 7ms/step - loss: 0.7246 - accuracy: 0.5063 - val_loss: 0.7385 - val_accuracy: 0.4752 Epoch 49/60 150/150 [==============================] - 1s 7ms/step - loss: 0.7078 - accuracy: 0.5153 - val_loss: 0.7103 - val_accuracy: 0.4750 Epoch 50/60 150/150 [==============================] - 1s 6ms/step - loss: 0.7058 - accuracy: 0.5211 - val_loss: 0.6961 - val_accuracy: 0.5417 Epoch 51/60 150/150 [==============================] - 1s 7ms/step - loss: 0.7020 - accuracy: 0.5283 - val_loss: 0.6984 - val_accuracy: 0.5440 Epoch 52/60 150/150 [==============================] - 1s 7ms/step - loss: 0.7004 - accuracy: 0.5309 - val_loss: 0.7235 - val_accuracy: 0.5305 Epoch 53/60 150/150 [==============================] - 1s 7ms/step - loss: 0.7026 - accuracy: 0.5310 - val_loss: 0.6944 - val_accuracy: 0.5410 Epoch 54/60 150/150 [==============================] - 1s 6ms/step - loss: 0.7154 - accuracy: 0.5219 - val_loss: 0.6938 - val_accuracy: 0.5492 Epoch 55/60 150/150 [==============================] - 1s 7ms/step - loss: 0.6941 - accuracy: 0.5462 - val_loss: 0.6898 - val_accuracy: 0.5514 Epoch 56/60 150/150 [==============================] - 1s 7ms/step - 
loss: 22.4201 - accuracy: 0.5273 - val_loss: 1.4550 - val_accuracy: 0.5319 Epoch 57/60 150/150 [==============================] - 1s 7ms/step - loss: 1.3315 - accuracy: 0.5312 - val_loss: 1.8703 - val_accuracy: 0.5300 Epoch 58/60 150/150 [==============================] - 1s 7ms/step - loss: 0.8320 - accuracy: 0.5299 - val_loss: 0.6914 - val_accuracy: 0.5301 Epoch 59/60 150/150 [==============================] - 1s 7ms/step - loss: 0.6919 - accuracy: 0.5300 - val_loss: 0.6922 - val_accuracy: 0.5304 Epoch 60/60 150/150 [==============================] - 1s 7ms/step - loss: 0.6916 - accuracy: 0.5304 - val_loss: 0.6927 - val_accuracy: 0.5300
# Final loss/accuracy on both splits for the binary deep classifier.
train_metrics = model_reg1.evaluate(x_train, y_train, verbose=0)
test_metrics = model_reg1.evaluate(x_test, y_test, verbose=0)
for split, (ce, acc) in (("Train", train_metrics), ("Test", test_metrics)):
    print(f"Categorical Crossentropy {split} : {ce} || Accuracy {split} : {acc}")
Categorical Crossentropy Train : 0.6920933127403259 || Accuracy Train : 0.5306687951087952 Categorical Crossentropy Test : 0.6927127838134766 || Accuracy Test : 0.5299509763717651
# Train/validation loss curves for the binary model, then test predictions.
for key, lbl in (('loss', 'train'), ('val_loss', 'test')):
    plt.plot(reg1_history.history[key], label=lbl)
_ = plt.legend()
plt.xlabel("no epochs")
plt.ylabel("value loss")
y_pred_cla = model_reg1.predict(x_test)
345/345 [==============================] - 1s 2ms/step
# Map the network's output back to the original string labels.
# NOTE(review): y_pred_cla holds probabilities, not exact one-hot rows —
# confirm OneHotEncoder.inverse_transform resolves them as intended.
ohe_dl.inverse_transform(y_pred_cla)
array([['low'],
['low'],
['low'],
...,
['low'],
['low'],
['low']], dtype=object)
cla_mul_dl_news = m_news.copy()
# Multi-class dataset: drop raw shares and the binary label; Class_shares2
# (four classes) is the target.
cla_mul_dl_news = cla_mul_dl_news.drop(columns=["shares", "Class_shares1"])
x = cla_mul_dl_news.drop(columns=["Class_shares2"])
y = cla_mul_dl_news["Class_shares2"].to_numpy()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)  # data is already scaled
# One-hot encode the four class labels for categorical crossentropy.
ohe_dl_cla_mul = OneHotEncoder(handle_unknown='ignore')
y_train = ohe_dl_cla_mul.fit_transform(y_train.reshape(-1, 1)).toarray()
y_test = ohe_dl_cla_mul.transform(y_test.reshape(-1, 1)).toarray()
# Small multi-class network: one 8-unit hidden layer, softmax over 4 classes.
model_reg2 = Sequential()
model_reg2.add(Dense(8, input_shape=(63,), activation='relu'))
model_reg2.add(Dense(4, activation='softmax'))
# Compile the model
# NOTE(review): training losses in the hundreds (see output below) suggest the
# inputs may not actually be standardized despite the comment above — verify.
model_reg2.compile(loss='categorical_crossentropy',
metrics=['accuracy'],
optimizer=optimizers.Adam(0.004))
reg2_history = model_reg2.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=15, batch_size=100)
Epoch 1/15 224/224 [==============================] - 1s 3ms/step - loss: 11334.0996 - accuracy: 0.2618 - val_loss: 362.2202 - val_accuracy: 0.2966 Epoch 2/15 224/224 [==============================] - 1s 3ms/step - loss: 653.9594 - accuracy: 0.2715 - val_loss: 471.1823 - val_accuracy: 0.2748 Epoch 3/15 224/224 [==============================] - 1s 3ms/step - loss: 478.6538 - accuracy: 0.2712 - val_loss: 228.4968 - val_accuracy: 0.1712 Epoch 4/15 224/224 [==============================] - 1s 3ms/step - loss: 487.4342 - accuracy: 0.2732 - val_loss: 303.9554 - val_accuracy: 0.2904 Epoch 5/15 224/224 [==============================] - 1s 3ms/step - loss: 409.5179 - accuracy: 0.2691 - val_loss: 478.6061 - val_accuracy: 0.2821 Epoch 6/15 224/224 [==============================] - 1s 3ms/step - loss: 480.5945 - accuracy: 0.2714 - val_loss: 785.5074 - val_accuracy: 0.2776 Epoch 7/15 224/224 [==============================] - 1s 3ms/step - loss: 369.5719 - accuracy: 0.2721 - val_loss: 268.8868 - val_accuracy: 0.1563 Epoch 8/15 224/224 [==============================] - 1s 3ms/step - loss: 433.7944 - accuracy: 0.2748 - val_loss: 303.9224 - val_accuracy: 0.2835 Epoch 9/15 224/224 [==============================] - 1s 3ms/step - loss: 428.7841 - accuracy: 0.2680 - val_loss: 177.0049 - val_accuracy: 0.3050 Epoch 10/15 224/224 [==============================] - 1s 3ms/step - loss: 385.2944 - accuracy: 0.2711 - val_loss: 723.3517 - val_accuracy: 0.2841 Epoch 11/15 224/224 [==============================] - 1s 3ms/step - loss: 337.7956 - accuracy: 0.2682 - val_loss: 183.5805 - val_accuracy: 0.3362 Epoch 12/15 224/224 [==============================] - 1s 3ms/step - loss: 318.3657 - accuracy: 0.2703 - val_loss: 255.8561 - val_accuracy: 0.2791 Epoch 13/15 224/224 [==============================] - 1s 3ms/step - loss: 350.2973 - accuracy: 0.2685 - val_loss: 290.1129 - val_accuracy: 0.2910 Epoch 14/15 224/224 [==============================] - 1s 3ms/step - loss: 293.9024 - accuracy: 
0.2682 - val_loss: 197.1841 - val_accuracy: 0.3001 Epoch 15/15 224/224 [==============================] - 1s 3ms/step - loss: 277.3061 - accuracy: 0.2739 - val_loss: 465.7227 - val_accuracy: 0.2814
# Final loss/accuracy on both splits for the multi-class model.
train_metrics = model_reg2.evaluate(x_train, y_train, verbose=0)
test_metrics = model_reg2.evaluate(x_test, y_test, verbose=0)
for split, (ce, acc) in (("Train", train_metrics), ("Test", test_metrics)):
    print(f"Categorical Crossentropy {split} : {ce} || Accuracy {split} : {acc}")
Categorical Crossentropy Train : 461.2908630371094 || Accuracy Train : 0.288581907749176 Categorical Crossentropy Test : 465.7228698730469 || Accuracy Test : 0.2814485430717468
# Loss curves for the multi-class model, then predictions on the test split.
for key, lbl in (('loss', 'train'), ('val_loss', 'test')):
    plt.plot(reg2_history.history[key], label=lbl)
_ = plt.legend()
plt.xlabel("no epochs")
plt.ylabel("value loss")
y_pred_cla_mul = model_reg2.predict(x_test)
345/345 [==============================] - 0s 1ms/step
# Map the multi-class predictions back to the original string labels.
# NOTE(review): inputs are probabilities rather than exact one-hot rows —
# confirm inverse_transform handles them as intended.
ohe_dl_cla_mul.inverse_transform(y_pred_cla_mul)
array([['low'],
['low'],
['low'],
...,
['low'],
['low'],
['low']], dtype=object)
filename = 'model_api.sav'
# Persist the fitted grid search so the Flask API can reload it later.
# FIX: use a context manager — pickle.dump(obj, open(...)) leaked the handle.
# NOTE(review): grid_search_ABC is fitted in a LATER cell; this cell relies on
# interactive execution order, not top-to-bottom order.
with open(filename, 'wb') as f:
    pickle.dump(grid_search_ABC, f)
# Reload the engineered dataset for the API model from Drive; drop the CSV
# index column that read_csv materializes.
news_api = pd.read_csv('/content/drive/MyDrive/pyhton/news_api.csv')
news_api.drop(columns=['Unnamed: 0'], inplace = True)
news_api.head(5)
# Remove one column of each highly correlated pair (> 0.7). Only the upper
# triangle of the correlation matrix is scanned so each pair appears once.
threshold = 0.7
analysis_corr = news_api.corr()
upper_tri = analysis_corr.where(np.triu(np.ones(analysis_corr.shape),k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > threshold)]
# NOTE(review): the two slices below keep channel one-hot columns and LDA
# topics by POSITION in to_drop — fragile if the column set ever changes.
to_drop = to_drop[:12] #we don't want to suppress vectorization of chanels it is perfectly normal for them to be correlated
to_drop = to_drop[:5] + to_drop[8:] #we keep latent dirichlet allocation topics
print(to_drop)
news_api.drop(columns=to_drop, inplace=True)
# Attach the labels built earlier. Class_shares2 is added then deleted
# immediately — net effect: only the binary Class_shares1 label remains.
news_api['Class_shares1'] = Class_shares1
news_api['Class_shares2'] = Class_shares2
del news_api['Class_shares2']
# Persist the reduced table; the API rebuilds its scaler from this file.
news_api.to_csv('news_api.csv')
# Train an AdaBoost(decision-tree) classifier on the API feature table and
# tune it with a grid search on ROC-AUC.
x = news_api.loc[:, news_api.columns != "Class_shares1"]
y = news_api.loc[:, news_api.columns == "Class_shares1"]
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2)
# Binarize the label: 'high' -> 1, anything else -> 0; ravel to 1-D so sklearn
# estimators do not warn about column-vector targets.
y_train = np.ravel(np.where(y_train == 'high',1,0))
y_test = np.ravel(np.where(y_test == 'high',1,0))
# FIX: the scaler must be fitted on the training split only. The previous code
# called fit_transform on the test split too, which leaks test statistics and
# applies an inconsistent transformation. Also renamed `object` -> `scaler`
# (it shadowed the builtin).
scaler = StandardScaler()
x_train_scale = scaler.fit_transform(x_train)
x_test_scale = scaler.transform(x_test)
param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
"base_estimator__splitter" : ["best", "random"],
"n_estimators": [200, 416],
"algorithm" : ['SAMME']
}
DTC = DecisionTreeClassifier(random_state = 11, max_features = "auto", class_weight = "balanced",max_depth = 4)
ABC = AdaBoostClassifier(base_estimator = DTC)
# run grid search
grid_search_ABC = GridSearchCV(ABC, param_grid=param_grid, scoring = 'roc_auc')
grid_search_ABC.fit(x_train_scale, y_train)
grid_search_ABC.score(x_test_scale, y_test)
filename = 'model_api.sav'
# Reload the persisted grid-search model for serving.
# FIX: context manager — pickle.load(open(...)) leaked the file handle.
with open(filename, 'rb') as f:
    loaded_model = pickle.load(f)
def prediction(l):
    """Return the loaded model's prediction for the feature row(s) in *l*."""
    return loaded_model.predict(l)
def stringtodf(s):
    """Parse a '/'-separated feature string into a one-row, scaled DataFrame.

    Column order and scaler statistics are taken from the saved news_api.csv
    training table so the row matches what the model was trained on.
    """
    values = [float(z) for z in s.split('/')]
    temp = pd.read_csv('news_api.csv')
    del temp['Class_shares1']
    del temp['Unnamed: 0']
    data = dict(zip(temp.columns, values))
    df = pd.DataFrame(columns=temp.columns)
    df.loc[0] = pd.Series(data)
    # FIX: renamed `object` -> `scaler`; it shadowed the builtin. The scaler is
    # re-fitted on the reference table, then applied to the single input row.
    scaler = StandardScaler()
    scaler.fit(temp)
    df.loc[0] = scaler.transform(df)
    return df
app1 = Flask(__name__)

@app1.route('/status', methods=['GET'])
def a_live():
    """Health-check endpoint: confirms the API process is up."""
    return "Alive!"
@app1.route('/predict', methods=['GET'])
def predict():
    """Predict the share class for the '/'-separated features in `info`.

    Returns 'high' or 'low'. FIX: the old if/if chain returned None (an HTTP
    500 in Flask) for any value other than 0/1; 'low' is now the explicit
    fallback for the 0 branch.
    """
    args = request.args
    info = args['info']
    line = stringtodf(info)
    val = prediction(line)
    return 'high' if val[0] == 1 else 'low'
#app1.run(debug=True ,host='0.0.0.0', port=8080)
Link to our webapp : https://samuelpariente-online-news-popularity-webappwebapp-s3npdr.streamlit.app/
# NOTE(review): this cell duplicates the earlier news_api preparation cell —
# consider removing one copy.
# Reload the engineered dataset from Drive; drop the CSV index column.
news_api = pd.read_csv('/content/drive/MyDrive/pyhton/news_api.csv')
news_api.drop(columns=['Unnamed: 0'], inplace = True)
news_api.head(5)
# Remove one column of each pair correlated above 0.7 (upper triangle only,
# so each pair is considered once).
threshold = 0.7
analysis_corr = news_api.corr()
upper_tri = analysis_corr.where(np.triu(np.ones(analysis_corr.shape),k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > threshold)]
to_drop = to_drop[:12] #we don't want to suppress vectorization of chanels it is perfectly normal for them to be correlated
to_drop = to_drop[:5] + to_drop[8:] #we keep latent dirichlet allocation topics
print(to_drop)
news_api.drop(columns=to_drop, inplace=True)
# Class_shares2 is added then deleted — only the binary label is kept.
news_api['Class_shares1'] = Class_shares1
news_api['Class_shares2'] = Class_shares2
del news_api['Class_shares2']
news_api.to_csv('news_api.csv')
# Duplicate of the earlier AdaBoost grid-search cell, with the same fixes.
x = news_api.loc[:, news_api.columns != "Class_shares1"]
y = news_api.loc[:, news_api.columns == "Class_shares1"]
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2)
# Binarize the label and ravel to a 1-D target vector.
y_train = np.ravel(np.where(y_train == 'high',1,0))
y_test = np.ravel(np.where(y_test == 'high',1,0))
# FIX: fit the scaler on the training split only (the old code re-fitted it on
# the test split — data leakage); `object` renamed to avoid builtin shadowing.
scaler = StandardScaler()
x_train_scale = scaler.fit_transform(x_train)
x_test_scale = scaler.transform(x_test)
param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
"base_estimator__splitter" : ["best", "random"],
"n_estimators": [200, 416],
"algorithm" : ['SAMME']
}
DTC = DecisionTreeClassifier(random_state = 11, max_features = "auto", class_weight = "balanced",max_depth = 4)
ABC = AdaBoostClassifier(base_estimator = DTC)
# run grid search
grid_search_ABC = GridSearchCV(ABC, param_grid=param_grid, scoring = 'roc_auc')
grid_search_ABC.fit(x_train_scale, y_train)
grid_search_ABC.score(x_test_scale, y_test)
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 22 09:42:42 2022
@author: samue
"""
import streamlit as st
import requests
# Streamlit input widgets, one per model feature, defaults near dataset means.
st.title('Live Prediction')
p1 = st.slider('n_tokens_title', 0, 30 ,10, step = 1)
p2 = st.slider('n_tokens_content', 0, 10000,564, step = 1)
p3 = st.slider('n_unique_tokens', 0.00, 1.00,0.54, step = 0.01)
p4 = st.slider('n_non_stop_words', 0.00, 1.00,1.00, step = 0.01)
p5 = st.slider('num_hrefs', 0, 350,11, step = 1)
p6 = st.slider('num_self_hrefs', 0, 150,3, step = 1)
p7 = st.slider('num_imgs', 0, 100,5, step = 1)
p8 = st.slider('num_videos', 0, 150,1, step = 1)
p9 = st.slider('average_token_length', 0.00, 10.00,4.6, step = 0.01)
p10 = st.slider('num_keywords', 0.00, 10.00,7.2, step = 0.01)
# FIX: the selectbox labels were copy-pasted from the Streamlit docs
# ("How would you like to be contacted?"); use labels that describe the input.
p11 = st.selectbox(
    'Select the article channel :',
    ('lifestyle', 'entertainment', 'bus','socmed','tech','world'))
st.write('You selected:', p11)
p12 = st.slider('kw_min_min', 0, 300000,1150, step = 1)
p122 = st.slider('kw_max_min', 0, 300000,1152, step = 1)
p13 = st.slider('kw_min_max', 0, 843300,13062, step = 1)
p14 = st.slider('kw_max_max', 0, 843300,750683, step = 1)
p15 = st.slider('kw_avg_max', 0,843300,255064, step = 1)
p16 = st.slider('kw_min_avg', 0, 3613, 1094,step = 1)
p17 =st.slider('kw_max_avg', 0, 298400,5582, step = 1)
p18 = st.slider('self_reference_min_shares', 0, 850000,4131, step = 1)
p19 = st.slider('self_reference_max_shares', 0, 850000,10585, step = 1)
p20 = st.selectbox(
    'Select the publication day :',
    ('monday', 'tuesday', 'wednesday','thursday','friday','saturday','sunday'))
st.write('You selected:', p20)
p21 = st.slider('LDA_00', 0.00, 1.00,0.18, step = 0.01)
p22 = st.slider('LDA_01', 0.00, 1.00,0.14, step = 0.01)
p23 = st.slider('LDA_02', 0.00, 1.00,0.21, step = 0.01)
p24 = st.slider('LDA_03', 0.00, 1.00,0.21, step = 0.01)
p25 = st.slider('LDA_04', 0.00, 1.00,0.23, step = 0.01)
p26 = st.slider('global_subjectivity', 0.00, 1.00,0.45, step = 0.01)
p27 = st.slider('global_sentiment_polarity',0.00, 1.00,0.12, step = 0.01)
p28 = st.slider('global_rate_positive_words', 0.00, 0.20,0.04, step = 0.01)
# FIX: stray trailing comma removed from the label below.
p29 = st.slider('global_rate_negative_words', 0.00, 0.20,0.01, step = 0.01)
p30 = st.slider('avg_positive_polarity', 0.00, 1.00,0.36, step = 0.01)
p31 =st.slider('min_positive_polarity', 0.00, 1.00,0.09, step = 0.01)
p32 = st.slider('max_positive_polarity', 0.00, 1.00,0.77, step = 0.01)
p33 = st.slider('avg_negative_polarity', -1.00, 0.00,-0.26, step = 0.01)
p34 = st.slider('max_negative_polarity', -1.00, 0.00,-0.11, step = 0.01)
p35 = st.slider('title_subjectivity', 0.00, 1.00,0.28, step = 0.01)
p36 =st.slider('title_sentiment_polarity', 0.00, 1.00,0.06, step = 0.01)
p37 = st.slider('abs_title_subjectivity', 0.00, 1.00,0.34, step = 0.01)
p38 = st.slider('NbVisit', 0.00, 15.00,7.41,step = 0.01)
# Encode the selected weekday and channel as '/'-joined one-hot flags, in the
# column order the model expects. Unknown values pass through unchanged,
# matching the original if-chain behavior.
day_codes = {
    'monday':    '1/0/0/0/0/0/0',
    'tuesday':   '0/1/0/0/0/0/0',
    'wednesday': '0/0/1/0/0/0/0',
    'thursday':  '0/0/0/1/0/0/0',
    'friday':    '0/0/0/0/1/0/0',
    'saturday':  '0/0/0/0/0/1/0',
    'sunday':    '0/0/0/0/0/0/1',
}
channel_codes = {
    'lifestyle':     '1/0/0/0/0/0',
    'entertainment': '0/1/0/0/0/0',
    'bus':           '0/0/1/0/0/0',
    'socmed':        '0/0/0/1/0/0',
    'tech':          '0/0/0/0/1/0',
    'world':         '0/0/0/0/0/1',
}
p20 = day_codes.get(p20, p20)
p11 = channel_codes.get(p11, p11)
# Assemble the query value in the model's feature order and call the EC2 API.
fields = [p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p122, p13, p14,
          p15, p16, p17, p18, p19, p20, p21, p22, p23, p24, p25, p26, p27,
          p28, p29, p30, p31, p32, p33, p34, p35, p36, p37, p38]
info1 = '/'.join(str(f) for f in fields)
info = 'http://3.80.212.138:8080/predict?info='+info1
if st.button('Predict !'):
    a = requests.get(info)
    # FIX: repaired the garbled user-facing message ("gona whave").
    st.write('Your article is going to have ' + a.text + ' shares')
!jupyter nbconvert --to html '/content/drive/MyDrive/Colab Notebooks/Projet-Final-[V2.9].ipynb'
# Discretize subjectivity/polarity into 0.1-wide bins and pivot total shares
# per (global_subjectivity, avg_positive_polarity) cell.
# NOTE: the misspelled `descrete_...` name is kept — a later cell refers to it.
pol_subj_wr_shares = v_news[["global_subjectivity", "avg_positive_polarity", "avg_negative_polarity", "shares"]]
descrete_pol_subj_wr_shares = pd.DataFrame()
bins_v = [round(x, 1) for x in np.arange(0, 1.1, 0.1)]
labels_v = [round(i, 1) for i in np.arange(0, 1, 0.1)]
bins_v_neg = [round(x, 1) for x in np.arange(-1, 0.1, 0.1)]
labels_v_neg = [round(i, 1) for i in np.arange(-1, 0, 0.1)]
descrete_pol_subj_wr_shares["global_subjectivity"] = pd.cut(pol_subj_wr_shares["global_subjectivity"],
                                                            bins = bins_v,
                                                            labels = labels_v)
descrete_pol_subj_wr_shares["avg_positive_polarity"] = pd.cut(pol_subj_wr_shares["avg_positive_polarity"],
                                                              bins = bins_v,
                                                              labels = labels_v)
# FIX: removed a duplicated cut of avg_negative_polarity with the POSITIVE
# bins — it was dead code, immediately overwritten by the cut below.
descrete_pol_subj_wr_shares["avg_negative_polarity"] = pd.cut(pol_subj_wr_shares["avg_negative_polarity"],
                                                              bins = bins_v_neg,
                                                              labels = labels_v_neg)
descrete_pol_subj_wr_shares["shares"] = pol_subj_wr_shares["shares"]
discrete_table_pol_subj_wr_shares = pd.pivot_table(descrete_pol_subj_wr_shares,
                                                   index="global_subjectivity",
                                                   columns="avg_positive_polarity",
                                                   values="shares",
                                                   aggfunc="sum").fillna(0)
# 3-D bar chart: total shares per (subjectivity, positive-polarity) bin.
dx, dy = 0.8, 0.8
fig = plt.figure(figsize=(10,6))
# Axes3D no longer auto-registers with the figure; it is added explicitly.
_=ax = Axes3D(fig, auto_add_to_figure=False)
fig.add_axes(ax)
xpos=np.arange(discrete_table_pol_subj_wr_shares.shape[0])
ypos=np.arange(discrete_table_pol_subj_wr_shares.shape[1])
# Center the tick marks on the 0.8-wide bars.
ax.set_xticks(xpos + dx/2)
ax.set_yticks(ypos + dy/2)
xpos, ypos = np.meshgrid(xpos, ypos)
xpos = xpos.flatten()
ypos = ypos.flatten()
zpos=np.zeros(discrete_table_pol_subj_wr_shares.shape).flatten()
# Bar heights are the pivot-table sums, colored by relative magnitude.
# NOTE(review): dz is raveled from the (x, y)-shaped table while xpos/ypos come
# from meshgrid's (y, x) layout — verify bar/value alignment is correct.
dz = discrete_table_pol_subj_wr_shares.values.ravel()
offset = dz + np.abs(dz.min())
fracs = offset.astype(float)/offset.max()
norm = plt_colors.Normalize(fracs.min(), fracs.max())
colors = cm.jet(norm(fracs))
ax.bar3d(xpos,ypos,zpos,dx,dy,dz, color=colors)
# put the column / index labels
# NOTE(review): w_xaxis/w_yaxis are deprecated in newer matplotlib — confirm
# the installed version still supports these accessors.
ax.w_yaxis.set_ticklabels(discrete_table_pol_subj_wr_shares.columns)
ax.w_xaxis.set_ticklabels(discrete_table_pol_subj_wr_shares.index)
# name the axes
_=ax.set_xlabel('Global Subjectivity')
_=ax.set_ylabel('Average Positive Polarity')
_=ax.set_zlabel('Shares')
plt.autoscale(enable=True, axis='both', tight=True)
plt.grid()
plt.show()
# Bokeh heatmap of shares by subjectivity (x) and positive polarity (y).
pol_subj_wr_shares = v_news[["global_subjectivity", "avg_positive_polarity", "avg_negative_polarity", "shares"]]
discrete_pol_subj_wr_shares = pd.DataFrame()
bins_v = [round(x, 1) for x in np.arange(0, 1.1, 0.1)]
labels_v = [round(i, 1) for i in np.arange(0, 1, 0.1)]
bins_v_neg = [round(x, 1) for x in np.arange(-1, 0.1, 0.1)]
labels_v_neg = [round(i, 1) for i in np.arange(-1, 0, 0.1)]
discrete_pol_subj_wr_shares["global_subjectivity"] = pd.cut(pol_subj_wr_shares["global_subjectivity"],
                                                            bins = bins_v,
                                                            labels = labels_v)
discrete_pol_subj_wr_shares["avg_positive_polarity"] = pd.cut(pol_subj_wr_shares["avg_positive_polarity"],
                                                              bins = bins_v,
                                                              labels = labels_v)
# FIX: removed a duplicated cut of avg_negative_polarity with the positive
# bins — dead code, immediately overwritten by the negative-bin cut.
discrete_pol_subj_wr_shares["avg_negative_polarity"] = pd.cut(pol_subj_wr_shares["avg_negative_polarity"],
                                                              bins = bins_v_neg,
                                                              labels = labels_v_neg)
discrete_pol_subj_wr_shares["shares"] = v_news["shares"]
discrete_pol_subj_wr_shares.reset_index(drop=True, inplace = True)
# FIX: build the source from the table assembled in THIS cell; it previously
# referenced the misspelled `descrete_...` frame from an earlier cell.
source = ColumnDataSource(discrete_pol_subj_wr_shares)
# this is the colormap from the original NYTimes plot
colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]
mapper = LinearColorMapper(palette=colors,
                           low=discrete_pol_subj_wr_shares.shares.min(),
                           high=discrete_pol_subj_wr_shares.shares.max())
# FIX: title and percent tick format were leftovers from the Bokeh
# unemployment example; shares are plain counts.
p = figure(width=800, height=300, title="Shares by subjectivity and positive polarity")
p.rect(x="global_subjectivity", y="avg_positive_polarity", width=1, height=1, source=source,
       line_color=None, fill_color=transform('shares', mapper))
color_bar = ColorBar(color_mapper=mapper,
                     ticker=BasicTicker(desired_num_ticks=len(colors)),
                     formatter=PrintfTickFormatter(format="%d"))
p.add_layout(color_bar, 'right')
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "7px"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = 1.0
_=show(p, notebook_handle=True)
# Load the KidCreative marketing dataset (remote CSV) used in the
# logistic-regression demonstration; the bare `data` displays the frame.
url = 'https://raw.githubusercontent.com/werowe/logisticRegressionBestModel/master/KidCreative.csv'
data = pd.read_csv(url, delimiter=',')
data